Fixed the server crash that occurred when the user made a mistake in the URL.

This commit is contained in:
Filip Dvoran 2024-10-01 14:41:19 +02:00
parent 008e00940b
commit e7dc0366bf
3 changed files with 198 additions and 82 deletions

View File

@ -651,7 +651,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} }
} catch (error) { } catch (error) {
console.log('Invalid URL:', urlToCrawl, error); console.log('Invalid URL:', urlToCrawl, error);
return sendResponse(res, 'Invalid URL', { contentType: 'text/plain', envelope: null, code: 400 }); return sendResponse(res, 'Invalid URL or TLD', { contentType: 'text/plain', envelope: null, code: 400 });
} }
// Prevent circular crawling // Prevent circular crawling
@ -663,6 +663,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
let lastScrapped: PageSnapshot | undefined; let lastScrapped: PageSnapshot | undefined;
try {
for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) { for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped; lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) { if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
@ -675,6 +676,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith); return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
} }
} }
} catch (scrapError: any) {
console.error('Error during scraping:', scrapError);
if (scrapError instanceof AssertionFailureError &&
(scrapError.message.includes('Invalid TLD') || scrapError.message.includes('ERR_NAME_NOT_RESOLVED'))) {
const errorSnapshot: PageSnapshot = {
title: 'Error: Invalid domain or TLD',
href: parsedUrl.toString(),
html: '',
text: `Failed to access the page due to an invalid domain or TLD: ${parsedUrl.toString()}`,
error: 'Invalid domain or TLD'
};
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, errorSnapshot, parsedUrl);
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
}
throw scrapError; // Re-throw if it's not a handled error
}
if (!lastScrapped) { if (!lastScrapped) {
return sendResponse(res, 'No content available', { contentType: 'text/plain', envelope: null, code: 404 }); return sendResponse(res, 'No content available', { contentType: 'text/plain', envelope: null, code: 404 });

View File

@ -17,10 +17,17 @@ app.use('/instant-screenshots', express.static(path.join('/app', 'local-storage'
// Catch-all route: every request, whatever its method or path, is handed to the crawler.
// Failures that escape crawl() are logged and turned into a JSON error response.
app.all('*', async (req, res) => {
  try {
    await crawlerHost.crawl(req, res);
  } catch (error: unknown) {
    console.error('Error during crawl:', error);

    // Anything can be thrown, so only read `.message` after narrowing to Error;
    // otherwise this handler itself would throw and the rejection would go unhandled.
    const message = error instanceof Error ? error.message : String(error);

    // If crawl() already started writing the response, setting a status or body
    // again would throw. Just make sure the response is terminated.
    if (res.headersSent) {
      if (!res.writableEnded) {
        res.end();
      }
      return;
    }

    // Check the kind of error: an invalid TLD is the caller's mistake, not a server fault.
    if (message.includes('Invalid TLD')) {
      res.status(400).json({ error: 'Invalid URL or TLD' });
    } else {
      // Handle every other error as an internal failure.
      res.status(500).json({ error: 'An error occurred during the crawl' });
    }
  }
});
app.listen(port, () => { app.listen(port, () => {

View File

@ -63,6 +63,7 @@ export interface PageSnapshot {
maxElemDepth?: number; maxElemDepth?: number;
elemCount?: number; elemCount?: number;
childFrames?: PageSnapshot[]; childFrames?: PageSnapshot[];
error?: string;
} }
export interface ExtendedSnapshot extends PageSnapshot { export interface ExtendedSnapshot extends PageSnapshot {
@ -316,6 +317,17 @@ export class PuppeteerControl extends AsyncService {
this.logger.warn(`Browser killed`); this.logger.warn(`Browser killed`);
} }
/**
 * Best-effort extraction of a "name.tld"-style domain from a URL, without
 * consulting a public-suffix list.
 *
 * - IP-literal hosts are returned whole, since keeping only the last two
 *   dot-separated parts of an IPv4 address (e.g. "0.1") is meaningless.
 * - A trailing root dot ("example.com.") is ignored so it does not count as
 *   an empty label.
 * - Hosts with a single label (e.g. "localhost") are returned as-is.
 *
 * @param url Absolute URL to inspect.
 * @returns The last two labels of the hostname joined by ".", the whole
 *          hostname for single-label or IP hosts, or the input string
 *          unchanged when it cannot be parsed as a URL.
 */
private extractDomain(url: string): string {
  try {
    const { hostname } = new URL(url);

    // IPv4 literals are dotted but are not domain labels; IPv6 literals are bracketed.
    const isIpv4 = /^\d{1,3}(?:\.\d{1,3}){3}$/.test(hostname);
    const isIpv6 = hostname.startsWith('[');
    if (isIpv4 || isIpv6) {
      return hostname;
    }

    // Drop the optional trailing root dot of a fully-qualified name.
    const bareHost = hostname.replace(/\.$/, '');
    const labels = bareHost.split('.');
    return labels.length > 1 ? labels.slice(-2).join('.') : bareHost;
  } catch (error: unknown) {
    // new URL() throws on unparsable input; log it and fall back to the raw string.
    const reason = error instanceof Error ? error.message : String(error);
    this.logger.warn(`Failed to extract domain from URL: ${url}. Error: ${reason}`);
    return url;
  }
}
async newPage() { async newPage() {
await this.serviceReady(); await this.serviceReady();
const dedicatedContext = await this.browser.createBrowserContext(); const dedicatedContext = await this.browser.createBrowserContext();
@ -355,8 +367,15 @@ export class PuppeteerControl extends AsyncService {
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') { if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
return req.abort('blockedbyclient', 1000); return req.abort('blockedbyclient', 1000);
} }
try {
const tldParsed = tldExtract(requestUrl); const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain); domainSet.add(tldParsed.domain);
} catch (error) {
this.logger.warn(`Failed to parse TLD for URL: ${requestUrl}. Using fallback method.`);
const simpleDomain = this.extractDomain(requestUrl);
domainSet.add(simpleDomain);
}
const parsedUrl = new URL(requestUrl); const parsedUrl = new URL(requestUrl);
@ -547,15 +566,29 @@ document.addEventListener('load', handlePageLoad);
const timeout = options?.timeoutMs || 30_000; const timeout = options?.timeoutMs || 30_000;
const gotoPromise = page.goto(url, { try {
let waitForPromise: Promise<any> | undefined;
let gotoPromise: Promise<PageSnapshot | void>;
gotoPromise = page.goto(url, {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'], waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout, timeout,
}) })
.catch((err) => { .catch((err: any) => {
if (err instanceof TimeoutError) { if (err instanceof TimeoutError || err.message.includes('ERR_NAME_NOT_RESOLVED')) {
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) }); this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
return {
title: 'Error: Unable to access page',
href: url,
html: '',
text: `Failed to access the page: ${err.message}`,
error: err.message
} as PageSnapshot;
}
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Page ${sn}: Invalid domain or TLD for ${url}`, { err: marshalErrorLike(err) });
return new AssertionFailureError({ return new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`, message: `Invalid domain or TLD for ${url}: ${err}`,
cause: err, cause: err,
}); });
} }
@ -622,7 +655,6 @@ document.addEventListener('load', handlePageLoad);
} }
}); });
let waitForPromise: Promise<any> | undefined;
if (options?.waitForSelector) { if (options?.waitForSelector) {
console.log('Waiting for selector', options.waitForSelector); console.log('Waiting for selector', options.waitForSelector);
const t0 = Date.now(); const t0 = Date.now();
@ -663,7 +695,17 @@ document.addEventListener('load', handlePageLoad);
ckpt.push(delay(options.minIntervalMs)); ckpt.push(delay(options.minIntervalMs));
} }
let error; let error;
await Promise.race(ckpt).catch((err) => error = err); await Promise.race(ckpt).catch((err) => {
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Invalid domain or TLD encountered: ${err.message}`);
error = new AssertionFailureError({
message: `Invalid domain or TLD for ${url}: ${err.message}`,
cause: err,
});
} else {
error = err;
}
});
if (finalized && !error) { if (finalized && !error) {
if (!snapshot && !screenshot) { if (!snapshot && !screenshot) {
if (error) { if (error) {
@ -683,16 +725,66 @@ document.addEventListener('load', handlePageLoad);
yield { ...snapshot, screenshot, pageshot } as PageSnapshot; yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
} }
if (error) { if (error) {
if (error instanceof AssertionFailureError &&
(error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Continuing despite Invalid domain or TLD: ${error.message}`);
yield {
title: '',
href: url,
html: '',
text: '',
screenshot,
pageshot,
error: 'Invalid domain or TLD'
} as PageSnapshot;
break;
} else {
throw error; throw error;
} }
} }
}
} catch (error: any) {
if (error.message && (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
this.logger.warn(`Invalid domain or TLD encountered: ${error.message}`);
yield {
title: '',
href: url,
html: '',
text: '',
screenshot,
pageshot,
error: 'Invalid domain or TLD'
} as PageSnapshot;
} else {
throw error;
}
} finally { } finally {
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => { if (typeof waitForPromise !== 'undefined' && typeof gotoPromise !== 'undefined') {
Promise.allSettled([gotoPromise, waitForPromise]).finally(() => {
page.off('snapshot', hdl); page.off('snapshot', hdl);
this.ditchPage(page); this.ditchPage(page);
}); });
} else if (typeof gotoPromise !== 'undefined') {
gotoPromise.finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
} else {
page.off('snapshot', hdl);
this.ditchPage(page);
}
nextSnapshotDeferred.resolve(); nextSnapshotDeferred.resolve();
} }
} catch (error: any) {
this.logger.error(`Unhandled error in scrap method:`, error);
yield {
title: 'Error: Unhandled exception',
href: url,
html: '',
text: `An unexpected error occurred: ${error.message}`,
error: 'Unhandled exception'
} as PageSnapshot;
}
} }
async salvage(url: string, page: Page) { async salvage(url: string, page: Page) {