mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-15 19:22:20 +08:00
Fixed the server crash that occurred when the user made a mistake in the URL.
This commit is contained in:
parent
008e00940b
commit
e7dc0366bf
|
@ -651,7 +651,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
}
|
||||
} catch (error) {
|
||||
console.log('Invalid URL:', urlToCrawl, error);
|
||||
return sendResponse(res, 'Invalid URL', { contentType: 'text/plain', envelope: null, code: 400 });
|
||||
return sendResponse(res, 'Invalid URL or TLD', { contentType: 'text/plain', envelope: null, code: 400 });
|
||||
}
|
||||
|
||||
// Prevent circular crawling
|
||||
|
@ -663,6 +663,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
|
||||
let lastScrapped: PageSnapshot | undefined;
|
||||
|
||||
try {
|
||||
for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
|
@ -675,6 +676,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
|
||||
}
|
||||
}
|
||||
} catch (scrapError: any) {
|
||||
console.error('Error during scraping:', scrapError);
|
||||
if (scrapError instanceof AssertionFailureError &&
|
||||
(scrapError.message.includes('Invalid TLD') || scrapError.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||
const errorSnapshot: PageSnapshot = {
|
||||
title: 'Error: Invalid domain or TLD',
|
||||
href: parsedUrl.toString(),
|
||||
html: '',
|
||||
text: `Failed to access the page due to an invalid domain or TLD: ${parsedUrl.toString()}`,
|
||||
error: 'Invalid domain or TLD'
|
||||
};
|
||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, errorSnapshot, parsedUrl);
|
||||
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
|
||||
}
|
||||
throw scrapError; // Re-throw if it's not a handled error
|
||||
}
|
||||
|
||||
if (!lastScrapped) {
|
||||
return sendResponse(res, 'No content available', { contentType: 'text/plain', envelope: null, code: 404 });
|
||||
|
|
|
@ -17,10 +17,17 @@ app.use('/instant-screenshots', express.static(path.join('/app', 'local-storage'
|
|||
app.all('*', async (req, res) => {
|
||||
try {
|
||||
await crawlerHost.crawl(req, res);
|
||||
} catch (error) {
|
||||
} catch (error: any) {
|
||||
console.error('Error during crawl:', error);
|
||||
|
||||
// Check the error type
|
||||
if (error.message.includes('Invalid TLD')) {
|
||||
res.status(400).json({ error: 'Invalid URL or TLD' });
|
||||
} else {
|
||||
// Handle other errors
|
||||
res.status(500).json({ error: 'An error occurred during the crawl' });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
app.listen(port, () => {
|
||||
|
|
|
@ -63,6 +63,7 @@ export interface PageSnapshot {
|
|||
maxElemDepth?: number;
|
||||
elemCount?: number;
|
||||
childFrames?: PageSnapshot[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
export interface ExtendedSnapshot extends PageSnapshot {
|
||||
|
@ -316,6 +317,17 @@ export class PuppeteerControl extends AsyncService {
|
|||
this.logger.warn(`Browser killed`);
|
||||
}
|
||||
|
||||
/**
 * Best-effort derivation of a registrable-looking domain from a URL without
 * consulting a TLD list. The request interceptor calls this as a fallback
 * when `tldExtract` throws for the request URL.
 *
 * - IPv4 literals are returned whole, because their dot-separated parts are
 *   not hierarchical labels and truncating them would merge unrelated hosts.
 * - A single trailing root dot (`example.com.`) is ignored.
 * - Otherwise the last two labels of the hostname are kept; a single-label
 *   hostname is returned as is.
 *
 * @param url - Absolute URL to inspect.
 * @returns The derived domain, or `url` unchanged when it cannot be parsed.
 */
private extractDomain(url: string): string {
    try {
        const { hostname } = new URL(url);

        // The URL parser normalizes IPv4 hosts to dotted-decimal form, so a
        // simple shape check is enough to recognise them here.
        if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(hostname)) {
            return hostname;
        }

        // Strip the fully-qualified trailing dot so it does not produce an
        // empty last label (which would yield results such as "com.").
        const normalized = hostname.replace(/\.$/, '');
        const labels = normalized.split('.');

        return labels.length > 1 ? labels.slice(-2).join('.') : normalized;
    } catch (error: unknown) {
        // `new URL` throws a TypeError on malformed input; narrow before
        // reading `.message` so a non-Error throwable cannot break logging.
        const reason = error instanceof Error ? error.message : String(error);
        this.logger.warn(`Failed to extract domain from URL: ${url}. Error: ${reason}`);
        return url;
    }
}
|
||||
|
||||
async newPage() {
|
||||
await this.serviceReady();
|
||||
const dedicatedContext = await this.browser.createBrowserContext();
|
||||
|
@ -355,8 +367,15 @@ export class PuppeteerControl extends AsyncService {
|
|||
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
try {
|
||||
const tldParsed = tldExtract(requestUrl);
|
||||
domainSet.add(tldParsed.domain);
|
||||
} catch (error) {
|
||||
this.logger.warn(`Failed to parse TLD for URL: ${requestUrl}. Using fallback method.`);
|
||||
const simpleDomain = this.extractDomain(requestUrl);
|
||||
domainSet.add(simpleDomain);
|
||||
}
|
||||
|
||||
const parsedUrl = new URL(requestUrl);
|
||||
|
||||
|
@ -547,15 +566,29 @@ document.addEventListener('load', handlePageLoad);
|
|||
|
||||
const timeout = options?.timeoutMs || 30_000;
|
||||
|
||||
const gotoPromise = page.goto(url, {
|
||||
try {
|
||||
let waitForPromise: Promise<any> | undefined;
|
||||
let gotoPromise: Promise<PageSnapshot | void>;
|
||||
|
||||
gotoPromise = page.goto(url, {
|
||||
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
||||
timeout,
|
||||
})
|
||||
.catch((err) => {
|
||||
if (err instanceof TimeoutError) {
|
||||
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });
|
||||
.catch((err: any) => {
|
||||
if (err instanceof TimeoutError || err.message.includes('ERR_NAME_NOT_RESOLVED')) {
|
||||
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
|
||||
return {
|
||||
title: 'Error: Unable to access page',
|
||||
href: url,
|
||||
html: '',
|
||||
text: `Failed to access the page: ${err.message}`,
|
||||
error: err.message
|
||||
} as PageSnapshot;
|
||||
}
|
||||
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||
this.logger.warn(`Page ${sn}: Invalid domain or TLD for ${url}`, { err: marshalErrorLike(err) });
|
||||
return new AssertionFailureError({
|
||||
message: `Failed to goto ${url}: ${err}`,
|
||||
message: `Invalid domain or TLD for ${url}: ${err}`,
|
||||
cause: err,
|
||||
});
|
||||
}
|
||||
|
@ -622,7 +655,6 @@ document.addEventListener('load', handlePageLoad);
|
|||
}
|
||||
});
|
||||
|
||||
let waitForPromise: Promise<any> | undefined;
|
||||
if (options?.waitForSelector) {
|
||||
console.log('Waiting for selector', options.waitForSelector);
|
||||
const t0 = Date.now();
|
||||
|
@ -663,7 +695,17 @@ document.addEventListener('load', handlePageLoad);
|
|||
ckpt.push(delay(options.minIntervalMs));
|
||||
}
|
||||
let error;
|
||||
await Promise.race(ckpt).catch((err) => error = err);
|
||||
await Promise.race(ckpt).catch((err) => {
|
||||
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||
this.logger.warn(`Invalid domain or TLD encountered: ${err.message}`);
|
||||
error = new AssertionFailureError({
|
||||
message: `Invalid domain or TLD for ${url}: ${err.message}`,
|
||||
cause: err,
|
||||
});
|
||||
} else {
|
||||
error = err;
|
||||
}
|
||||
});
|
||||
if (finalized && !error) {
|
||||
if (!snapshot && !screenshot) {
|
||||
if (error) {
|
||||
|
@ -683,16 +725,66 @@ document.addEventListener('load', handlePageLoad);
|
|||
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
||||
}
|
||||
if (error) {
|
||||
if (error instanceof AssertionFailureError &&
|
||||
(error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||
this.logger.warn(`Continuing despite Invalid domain or TLD: ${error.message}`);
|
||||
yield {
|
||||
title: '',
|
||||
href: url,
|
||||
html: '',
|
||||
text: '',
|
||||
screenshot,
|
||||
pageshot,
|
||||
error: 'Invalid domain or TLD'
|
||||
} as PageSnapshot;
|
||||
break;
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error: any) {
|
||||
if (error.message && (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||
this.logger.warn(`Invalid domain or TLD encountered: ${error.message}`);
|
||||
yield {
|
||||
title: '',
|
||||
href: url,
|
||||
html: '',
|
||||
text: '',
|
||||
screenshot,
|
||||
pageshot,
|
||||
error: 'Invalid domain or TLD'
|
||||
} as PageSnapshot;
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
} finally {
|
||||
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
||||
if (typeof waitForPromise !== 'undefined' && typeof gotoPromise !== 'undefined') {
|
||||
Promise.allSettled([gotoPromise, waitForPromise]).finally(() => {
|
||||
page.off('snapshot', hdl);
|
||||
this.ditchPage(page);
|
||||
});
|
||||
} else if (typeof gotoPromise !== 'undefined') {
|
||||
gotoPromise.finally(() => {
|
||||
page.off('snapshot', hdl);
|
||||
this.ditchPage(page);
|
||||
});
|
||||
} else {
|
||||
page.off('snapshot', hdl);
|
||||
this.ditchPage(page);
|
||||
}
|
||||
nextSnapshotDeferred.resolve();
|
||||
}
|
||||
} catch (error: any) {
|
||||
this.logger.error(`Unhandled error in scrap method:`, error);
|
||||
yield {
|
||||
title: 'Error: Unhandled exception',
|
||||
href: url,
|
||||
html: '',
|
||||
text: `An unexpected error occurred: ${error.message}`,
|
||||
error: 'Unhandled exception'
|
||||
} as PageSnapshot;
|
||||
}
|
||||
}
|
||||
|
||||
async salvage(url: string, page: Page) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user