The issue with the server crash has been solved, if the user made a mistake in the URL.

This commit is contained in:
Filip Dvoran 2024-10-01 14:41:19 +02:00
parent 008e00940b
commit e7dc0366bf
3 changed files with 198 additions and 82 deletions

View File

@ -651,7 +651,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}
} catch (error) {
console.log('Invalid URL:', urlToCrawl, error);
return sendResponse(res, 'Invalid URL', { contentType: 'text/plain', envelope: null, code: 400 });
return sendResponse(res, 'Invalid URL or TLD', { contentType: 'text/plain', envelope: null, code: 400 });
}
// Prevent circular crawling
@ -663,17 +663,34 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
let lastScrapped: PageSnapshot | undefined;
for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
try {
for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, parsedUrl);
if (crawlerOptions.timeout === undefined) {
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
}
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, parsedUrl);
if (crawlerOptions.timeout === undefined) {
} catch (scrapError: any) {
console.error('Error during scraping:', scrapError);
if (scrapError instanceof AssertionFailureError &&
(scrapError.message.includes('Invalid TLD') || scrapError.message.includes('ERR_NAME_NOT_RESOLVED'))) {
const errorSnapshot: PageSnapshot = {
title: 'Error: Invalid domain or TLD',
href: parsedUrl.toString(),
html: '',
text: `Failed to access the page due to an invalid domain or TLD: ${parsedUrl.toString()}`,
error: 'Invalid domain or TLD'
};
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, errorSnapshot, parsedUrl);
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
}
throw scrapError; // Re-throw if it's not a handled error
}
if (!lastScrapped) {

View File

@ -17,9 +17,16 @@ app.use('/instant-screenshots', express.static(path.join('/app', 'local-storage'
app.all('*', async (req, res) => {
try {
await crawlerHost.crawl(req, res);
} catch (error) {
} catch (error: any) {
console.error('Error during crawl:', error);
res.status(500).json({ error: 'An error occurred during the crawl' });
// Kontrola typu chyby
if (error.message.includes('Invalid TLD')) {
res.status(400).json({ error: 'Invalid URL or TLD' });
} else {
// Ošetrenie iných chýb
res.status(500).json({ error: 'An error occurred during the crawl' });
}
}
});

View File

@ -63,6 +63,7 @@ export interface PageSnapshot {
maxElemDepth?: number;
elemCount?: number;
childFrames?: PageSnapshot[];
error?: string;
}
export interface ExtendedSnapshot extends PageSnapshot {
@ -316,6 +317,17 @@ export class PuppeteerControl extends AsyncService {
this.logger.warn(`Browser killed`);
}
private extractDomain(url: string): string {
try {
const { hostname } = new URL(url);
const parts = hostname.split('.');
return parts.length > 1 ? parts.slice(-2).join('.') : hostname;
} catch (error: any) {
this.logger.warn(`Failed to extract domain from URL: ${url}. Error: ${error.message}`);
return url;
}
}
async newPage() {
await this.serviceReady();
const dedicatedContext = await this.browser.createBrowserContext();
@ -355,8 +367,15 @@ export class PuppeteerControl extends AsyncService {
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
return req.abort('blockedbyclient', 1000);
}
const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain);
try {
const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain);
} catch (error) {
this.logger.warn(`Failed to parse TLD for URL: ${requestUrl}. Using fallback method.`);
const simpleDomain = this.extractDomain(requestUrl);
domainSet.add(simpleDomain);
}
const parsedUrl = new URL(requestUrl);
@ -547,15 +566,29 @@ document.addEventListener('load', handlePageLoad);
const timeout = options?.timeoutMs || 30_000;
const gotoPromise = page.goto(url, {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout,
})
.catch((err) => {
if (err instanceof TimeoutError) {
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });
try {
let waitForPromise: Promise<any> | undefined;
let gotoPromise: Promise<PageSnapshot | void>;
gotoPromise = page.goto(url, {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout,
})
.catch((err: any) => {
if (err instanceof TimeoutError || err.message.includes('ERR_NAME_NOT_RESOLVED')) {
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
return {
title: 'Error: Unable to access page',
href: url,
html: '',
text: `Failed to access the page: ${err.message}`,
error: err.message
} as PageSnapshot;
}
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Page ${sn}: Invalid domain or TLD for ${url}`, { err: marshalErrorLike(err) });
return new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`,
message: `Invalid domain or TLD for ${url}: ${err}`,
cause: err,
});
}
@ -622,76 +655,135 @@ document.addEventListener('load', handlePageLoad);
}
});
let waitForPromise: Promise<any> | undefined;
if (options?.waitForSelector) {
console.log('Waiting for selector', options.waitForSelector);
const t0 = Date.now();
waitForPromise = nextSnapshotDeferred.promise.then(() => {
const t1 = Date.now();
const elapsed = t1 - t0;
const remaining = timeout - elapsed;
const thisTimeout = remaining > 100 ? remaining : 100;
const p = (Array.isArray(options.waitForSelector) ?
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
.then(async () => {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
if (options?.waitForSelector) {
console.log('Waiting for selector', options.waitForSelector);
const t0 = Date.now();
waitForPromise = nextSnapshotDeferred.promise.then(() => {
const t1 = Date.now();
const elapsed = t1 - t0;
const remaining = timeout - elapsed;
const thisTimeout = remaining > 100 ? remaining : 100;
const p = (Array.isArray(options.waitForSelector) ?
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
.then(async () => {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
finalized = true;
})
.catch((err) => {
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
waitForPromise = undefined;
});
return p as any;
});
}
try {
let lastHTML = snapshot?.html;
while (true) {
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
if (waitForPromise) {
ckpt.push(waitForPromise);
}
if (options?.minIntervalMs) {
ckpt.push(delay(options.minIntervalMs));
}
let error;
await Promise.race(ckpt).catch((err) => {
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Invalid domain or TLD encountered: ${err.message}`);
error = new AssertionFailureError({
message: `Invalid domain or TLD for ${url}: ${err.message}`,
cause: err,
});
} else {
error = err;
}
});
if (finalized && !error) {
if (!snapshot && !screenshot) {
if (error) {
throw error;
}
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
}
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
finalized = true;
})
.catch((err) => {
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
waitForPromise = undefined;
});
return p as any;
});
}
try {
let lastHTML = snapshot?.html;
while (true) {
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
if (waitForPromise) {
ckpt.push(waitForPromise);
}
if (options?.minIntervalMs) {
ckpt.push(delay(options.minIntervalMs));
}
let error;
await Promise.race(ckpt).catch((err) => error = err);
if (finalized && !error) {
if (!snapshot && !screenshot) {
if (error) {
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
}
if (error) {
if (error instanceof AssertionFailureError &&
(error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Continuing despite Invalid domain or TLD: ${error.message}`);
yield {
title: '',
href: url,
html: '',
text: '',
screenshot,
pageshot,
error: 'Invalid domain or TLD'
} as PageSnapshot;
break;
} else {
throw error;
}
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
}
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
}
if (error) {
} catch (error: any) {
if (error.message && (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Invalid domain or TLD encountered: ${error.message}`);
yield {
title: '',
href: url,
html: '',
text: '',
screenshot,
pageshot,
error: 'Invalid domain or TLD'
} as PageSnapshot;
} else {
throw error;
}
} finally {
if (typeof waitForPromise !== 'undefined' && typeof gotoPromise !== 'undefined') {
Promise.allSettled([gotoPromise, waitForPromise]).finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
} else if (typeof gotoPromise !== 'undefined') {
gotoPromise.finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
} else {
page.off('snapshot', hdl);
this.ditchPage(page);
}
nextSnapshotDeferred.resolve();
}
} finally {
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
nextSnapshotDeferred.resolve();
} catch (error: any) {
this.logger.error(`Unhandled error in scrap method:`, error);
yield {
title: 'Error: Unhandled exception',
href: url,
html: '',
text: `An unexpected error occurred: ${error.message}`,
error: 'Unhandled exception'
} as PageSnapshot;
}
}