Merge pull request #9 from iOSDevSK/main

The issue with the server crash has been solved.
This commit is contained in:
intergalacticalvariable 2024-10-04 17:45:35 +02:00 committed by GitHub
commit d5eee95175
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 198 additions and 82 deletions

View File

@ -651,7 +651,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} }
} catch (error) { } catch (error) {
console.log('Invalid URL:', urlToCrawl, error); console.log('Invalid URL:', urlToCrawl, error);
return sendResponse(res, 'Invalid URL', { contentType: 'text/plain', envelope: null, code: 400 }); return sendResponse(res, 'Invalid URL or TLD', { contentType: 'text/plain', envelope: null, code: 400 });
} }
// Prevent circular crawling // Prevent circular crawling
@ -663,17 +663,34 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
let lastScrapped: PageSnapshot | undefined; let lastScrapped: PageSnapshot | undefined;
for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) { try {
lastScrapped = scrapped; for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) {
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) { lastScrapped = scrapped;
continue; if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, parsedUrl);
if (crawlerOptions.timeout === undefined) {
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
}
} }
} catch (scrapError: any) {
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, parsedUrl); console.error('Error during scraping:', scrapError);
if (scrapError instanceof AssertionFailureError &&
if (crawlerOptions.timeout === undefined) { (scrapError.message.includes('Invalid TLD') || scrapError.message.includes('ERR_NAME_NOT_RESOLVED'))) {
const errorSnapshot: PageSnapshot = {
title: 'Error: Invalid domain or TLD',
href: parsedUrl.toString(),
html: '',
text: `Failed to access the page due to an invalid domain or TLD: ${parsedUrl.toString()}`,
error: 'Invalid domain or TLD'
};
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, errorSnapshot, parsedUrl);
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith); return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
} }
throw scrapError; // Re-throw if it's not a handled error
} }
if (!lastScrapped) { if (!lastScrapped) {

View File

@ -17,9 +17,16 @@ app.use('/instant-screenshots', express.static(path.join('/app', 'local-storage'
app.all('*', async (req, res) => { app.all('*', async (req, res) => {
try { try {
await crawlerHost.crawl(req, res); await crawlerHost.crawl(req, res);
} catch (error) { } catch (error: any) {
console.error('Error during crawl:', error); console.error('Error during crawl:', error);
res.status(500).json({ error: 'An error occurred during the crawl' });
// Check the error type
if (error.message.includes('Invalid TLD')) {
res.status(400).json({ error: 'Invalid URL or TLD' });
} else {
// Handle all other errors
res.status(500).json({ error: 'An error occurred during the crawl' });
}
} }
}); });

View File

@ -63,6 +63,7 @@ export interface PageSnapshot {
maxElemDepth?: number; maxElemDepth?: number;
elemCount?: number; elemCount?: number;
childFrames?: PageSnapshot[]; childFrames?: PageSnapshot[];
error?: string;
} }
export interface ExtendedSnapshot extends PageSnapshot { export interface ExtendedSnapshot extends PageSnapshot {
@ -316,6 +317,17 @@ export class PuppeteerControl extends AsyncService {
this.logger.warn(`Browser killed`); this.logger.warn(`Browser killed`);
} }
/**
 * Derives a coarse domain key from a URL's hostname without consulting a
 * TLD database.
 *
 * - Multi-label hostnames yield their last two labels
 *   (`a.b.example.com` -> `example.com`). This is a heuristic: multi-part
 *   public suffixes such as `co.uk` are NOT recognised.
 * - IP literals (dotted-quad IPv4, bracketed IPv6) and single-label hosts
 *   (e.g. `localhost`) are returned whole, since slicing labels off an IP
 *   address produces a meaningless key such as `0.1`.
 * - A trailing root dot (`example.com.`) is ignored.
 *
 * @param url Absolute URL string.
 * @returns The derived domain key, or `url` unchanged when it cannot be
 *          parsed as a URL (a warning is logged in that case).
 */
private extractDomain(url: string): string {
    try {
        const { hostname } = new URL(url);
        // Strip the fully-qualified root dot so "example.com." is not reduced to "com.".
        const host = hostname.replace(/\.$/, '') || hostname;

        // The URL parser serialises IPv6 hosts in brackets and IPv4 hosts as a dotted quad;
        // neither has a "registrable domain", so keep the address intact.
        const isIpLiteral = host.startsWith('[') || /^\d{1,3}(?:\.\d{1,3}){3}$/.test(host);
        if (isIpLiteral) {
            return host;
        }

        const labels = host.split('.');
        return labels.length > 1 ? labels.slice(-2).join('.') : host;
    } catch (error: unknown) {
        const reason = error instanceof Error ? error.message : String(error);
        this.logger.warn(`Failed to extract domain from URL: ${url}. Error: ${reason}`);
        return url;
    }
}
async newPage() { async newPage() {
await this.serviceReady(); await this.serviceReady();
const dedicatedContext = await this.browser.createBrowserContext(); const dedicatedContext = await this.browser.createBrowserContext();
@ -355,8 +367,15 @@ export class PuppeteerControl extends AsyncService {
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') { if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
return req.abort('blockedbyclient', 1000); return req.abort('blockedbyclient', 1000);
} }
const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain); try {
const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain);
} catch (error) {
this.logger.warn(`Failed to parse TLD for URL: ${requestUrl}. Using fallback method.`);
const simpleDomain = this.extractDomain(requestUrl);
domainSet.add(simpleDomain);
}
const parsedUrl = new URL(requestUrl); const parsedUrl = new URL(requestUrl);
@ -547,15 +566,29 @@ document.addEventListener('load', handlePageLoad);
const timeout = options?.timeoutMs || 30_000; const timeout = options?.timeoutMs || 30_000;
const gotoPromise = page.goto(url, { try {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'], let waitForPromise: Promise<any> | undefined;
timeout, let gotoPromise: Promise<PageSnapshot | void>;
})
.catch((err) => { gotoPromise = page.goto(url, {
if (err instanceof TimeoutError) { waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) }); timeout,
})
.catch((err: any) => {
if (err instanceof TimeoutError || err.message.includes('ERR_NAME_NOT_RESOLVED')) {
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
return {
title: 'Error: Unable to access page',
href: url,
html: '',
text: `Failed to access the page: ${err.message}`,
error: err.message
} as PageSnapshot;
}
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Page ${sn}: Invalid domain or TLD for ${url}`, { err: marshalErrorLike(err) });
return new AssertionFailureError({ return new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`, message: `Invalid domain or TLD for ${url}: ${err}`,
cause: err, cause: err,
}); });
} }
@ -622,76 +655,135 @@ document.addEventListener('load', handlePageLoad);
} }
}); });
let waitForPromise: Promise<any> | undefined; if (options?.waitForSelector) {
if (options?.waitForSelector) { console.log('Waiting for selector', options.waitForSelector);
console.log('Waiting for selector', options.waitForSelector); const t0 = Date.now();
const t0 = Date.now(); waitForPromise = nextSnapshotDeferred.promise.then(() => {
waitForPromise = nextSnapshotDeferred.promise.then(() => { const t1 = Date.now();
const t1 = Date.now(); const elapsed = t1 - t0;
const elapsed = t1 - t0; const remaining = timeout - elapsed;
const remaining = timeout - elapsed; const thisTimeout = remaining > 100 ? remaining : 100;
const thisTimeout = remaining > 100 ? remaining : 100; const p = (Array.isArray(options.waitForSelector) ?
const p = (Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) : page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout })) .then(async () => {
.then(async () => { const pSubFrameSnapshots = this.snapshotChildFrames(page);
const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
finalized = true;
})
.catch((err) => {
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
waitForPromise = undefined;
});
return p as any;
});
}
try {
let lastHTML = snapshot?.html;
while (true) {
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
if (waitForPromise) {
ckpt.push(waitForPromise);
}
if (options?.minIntervalMs) {
ckpt.push(delay(options.minIntervalMs));
}
let error;
await Promise.race(ckpt).catch((err) => {
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Invalid domain or TLD encountered: ${err.message}`);
error = new AssertionFailureError({
message: `Invalid domain or TLD for ${url}: ${err.message}`,
cause: err,
});
} else {
error = err;
}
});
if (finalized && !error) {
if (!snapshot && !screenshot) {
if (error) {
throw error;
}
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
}
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot(); screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true }); pageshot = await page.screenshot({ fullPage: true });
if (snapshot) { lastHTML = snapshot.html;
snapshot.childFrames = await pSubFrameSnapshots; }
} if (snapshot || screenshot) {
finalized = true; yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
}) }
.catch((err) => { if (error) {
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) }); if (error instanceof AssertionFailureError &&
waitForPromise = undefined; (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
}); this.logger.warn(`Continuing despite Invalid domain or TLD: ${error.message}`);
return p as any; yield {
}); title: '',
} href: url,
html: '',
try { text: '',
let lastHTML = snapshot?.html; screenshot,
while (true) { pageshot,
const ckpt = [nextSnapshotDeferred.promise, gotoPromise]; error: 'Invalid domain or TLD'
if (waitForPromise) { } as PageSnapshot;
ckpt.push(waitForPromise); break;
} } else {
if (options?.minIntervalMs) {
ckpt.push(delay(options.minIntervalMs));
}
let error;
await Promise.race(ckpt).catch((err) => error = err);
if (finalized && !error) {
if (!snapshot && !screenshot) {
if (error) {
throw error; throw error;
} }
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
} }
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
break;
} }
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { } catch (error: any) {
screenshot = await page.screenshot(); if (error.message && (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
pageshot = await page.screenshot({ fullPage: true }); this.logger.warn(`Invalid domain or TLD encountered: ${error.message}`);
lastHTML = snapshot.html; yield {
} title: '',
if (snapshot || screenshot) { href: url,
yield { ...snapshot, screenshot, pageshot } as PageSnapshot; html: '',
} text: '',
if (error) { screenshot,
pageshot,
error: 'Invalid domain or TLD'
} as PageSnapshot;
} else {
throw error; throw error;
} }
} finally {
if (typeof waitForPromise !== 'undefined' && typeof gotoPromise !== 'undefined') {
Promise.allSettled([gotoPromise, waitForPromise]).finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
} else if (typeof gotoPromise !== 'undefined') {
gotoPromise.finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
} else {
page.off('snapshot', hdl);
this.ditchPage(page);
}
nextSnapshotDeferred.resolve();
} }
} finally { } catch (error: any) {
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => { this.logger.error(`Unhandled error in scrap method:`, error);
page.off('snapshot', hdl); yield {
this.ditchPage(page); title: 'Error: Unhandled exception',
}); href: url,
nextSnapshotDeferred.resolve(); html: '',
text: `An unexpected error occurred: ${error.message}`,
error: 'Unhandled exception'
} as PageSnapshot;
} }
} }