mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 03:32:25 +08:00
Merge pull request #9 from iOSDevSK/main
The issue with the server crash has been solved.
This commit is contained in:
commit
d5eee95175
|
@ -651,7 +651,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.log('Invalid URL:', urlToCrawl, error);
|
console.log('Invalid URL:', urlToCrawl, error);
|
||||||
return sendResponse(res, 'Invalid URL', { contentType: 'text/plain', envelope: null, code: 400 });
|
return sendResponse(res, 'Invalid URL or TLD', { contentType: 'text/plain', envelope: null, code: 400 });
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prevent circular crawling
|
// Prevent circular crawling
|
||||||
|
@ -663,17 +663,34 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
|
|
||||||
let lastScrapped: PageSnapshot | undefined;
|
let lastScrapped: PageSnapshot | undefined;
|
||||||
|
|
||||||
for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) {
|
try {
|
||||||
lastScrapped = scrapped;
|
for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) {
|
||||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
lastScrapped = scrapped;
|
||||||
continue;
|
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, parsedUrl);
|
||||||
|
|
||||||
|
if (crawlerOptions.timeout === undefined) {
|
||||||
|
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} catch (scrapError: any) {
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, parsedUrl);
|
console.error('Error during scraping:', scrapError);
|
||||||
|
if (scrapError instanceof AssertionFailureError &&
|
||||||
if (crawlerOptions.timeout === undefined) {
|
(scrapError.message.includes('Invalid TLD') || scrapError.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||||
|
const errorSnapshot: PageSnapshot = {
|
||||||
|
title: 'Error: Invalid domain or TLD',
|
||||||
|
href: parsedUrl.toString(),
|
||||||
|
html: '',
|
||||||
|
text: `Failed to access the page due to an invalid domain or TLD: ${parsedUrl.toString()}`,
|
||||||
|
error: 'Invalid domain or TLD'
|
||||||
|
};
|
||||||
|
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, errorSnapshot, parsedUrl);
|
||||||
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
|
return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith);
|
||||||
}
|
}
|
||||||
|
throw scrapError; // Re-throw if it's not a handled error
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!lastScrapped) {
|
if (!lastScrapped) {
|
||||||
|
|
|
@ -17,9 +17,16 @@ app.use('/instant-screenshots', express.static(path.join('/app', 'local-storage'
|
||||||
app.all('*', async (req, res) => {
|
app.all('*', async (req, res) => {
|
||||||
try {
|
try {
|
||||||
await crawlerHost.crawl(req, res);
|
await crawlerHost.crawl(req, res);
|
||||||
} catch (error) {
|
} catch (error: any) {
|
||||||
console.error('Error during crawl:', error);
|
console.error('Error during crawl:', error);
|
||||||
res.status(500).json({ error: 'An error occurred during the crawl' });
|
|
||||||
|
// Kontrola typu chyby
|
||||||
|
if (error.message.includes('Invalid TLD')) {
|
||||||
|
res.status(400).json({ error: 'Invalid URL or TLD' });
|
||||||
|
} else {
|
||||||
|
// Ošetrenie iných chýb
|
||||||
|
res.status(500).json({ error: 'An error occurred during the crawl' });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -63,6 +63,7 @@ export interface PageSnapshot {
|
||||||
maxElemDepth?: number;
|
maxElemDepth?: number;
|
||||||
elemCount?: number;
|
elemCount?: number;
|
||||||
childFrames?: PageSnapshot[];
|
childFrames?: PageSnapshot[];
|
||||||
|
error?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtendedSnapshot extends PageSnapshot {
|
export interface ExtendedSnapshot extends PageSnapshot {
|
||||||
|
@ -316,6 +317,17 @@ export class PuppeteerControl extends AsyncService {
|
||||||
this.logger.warn(`Browser killed`);
|
this.logger.warn(`Browser killed`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private extractDomain(url: string): string {
|
||||||
|
try {
|
||||||
|
const { hostname } = new URL(url);
|
||||||
|
const parts = hostname.split('.');
|
||||||
|
return parts.length > 1 ? parts.slice(-2).join('.') : hostname;
|
||||||
|
} catch (error: any) {
|
||||||
|
this.logger.warn(`Failed to extract domain from URL: ${url}. Error: ${error.message}`);
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async newPage() {
|
async newPage() {
|
||||||
await this.serviceReady();
|
await this.serviceReady();
|
||||||
const dedicatedContext = await this.browser.createBrowserContext();
|
const dedicatedContext = await this.browser.createBrowserContext();
|
||||||
|
@ -355,8 +367,15 @@ export class PuppeteerControl extends AsyncService {
|
||||||
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
}
|
}
|
||||||
const tldParsed = tldExtract(requestUrl);
|
|
||||||
domainSet.add(tldParsed.domain);
|
try {
|
||||||
|
const tldParsed = tldExtract(requestUrl);
|
||||||
|
domainSet.add(tldParsed.domain);
|
||||||
|
} catch (error) {
|
||||||
|
this.logger.warn(`Failed to parse TLD for URL: ${requestUrl}. Using fallback method.`);
|
||||||
|
const simpleDomain = this.extractDomain(requestUrl);
|
||||||
|
domainSet.add(simpleDomain);
|
||||||
|
}
|
||||||
|
|
||||||
const parsedUrl = new URL(requestUrl);
|
const parsedUrl = new URL(requestUrl);
|
||||||
|
|
||||||
|
@ -547,15 +566,29 @@ document.addEventListener('load', handlePageLoad);
|
||||||
|
|
||||||
const timeout = options?.timeoutMs || 30_000;
|
const timeout = options?.timeoutMs || 30_000;
|
||||||
|
|
||||||
const gotoPromise = page.goto(url, {
|
try {
|
||||||
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
let waitForPromise: Promise<any> | undefined;
|
||||||
timeout,
|
let gotoPromise: Promise<PageSnapshot | void>;
|
||||||
})
|
|
||||||
.catch((err) => {
|
gotoPromise = page.goto(url, {
|
||||||
if (err instanceof TimeoutError) {
|
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
||||||
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });
|
timeout,
|
||||||
|
})
|
||||||
|
.catch((err: any) => {
|
||||||
|
if (err instanceof TimeoutError || err.message.includes('ERR_NAME_NOT_RESOLVED')) {
|
||||||
|
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
|
||||||
|
return {
|
||||||
|
title: 'Error: Unable to access page',
|
||||||
|
href: url,
|
||||||
|
html: '',
|
||||||
|
text: `Failed to access the page: ${err.message}`,
|
||||||
|
error: err.message
|
||||||
|
} as PageSnapshot;
|
||||||
|
}
|
||||||
|
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||||
|
this.logger.warn(`Page ${sn}: Invalid domain or TLD for ${url}`, { err: marshalErrorLike(err) });
|
||||||
return new AssertionFailureError({
|
return new AssertionFailureError({
|
||||||
message: `Failed to goto ${url}: ${err}`,
|
message: `Invalid domain or TLD for ${url}: ${err}`,
|
||||||
cause: err,
|
cause: err,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -622,76 +655,135 @@ document.addEventListener('load', handlePageLoad);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let waitForPromise: Promise<any> | undefined;
|
if (options?.waitForSelector) {
|
||||||
if (options?.waitForSelector) {
|
console.log('Waiting for selector', options.waitForSelector);
|
||||||
console.log('Waiting for selector', options.waitForSelector);
|
const t0 = Date.now();
|
||||||
const t0 = Date.now();
|
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
||||||
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
const t1 = Date.now();
|
||||||
const t1 = Date.now();
|
const elapsed = t1 - t0;
|
||||||
const elapsed = t1 - t0;
|
const remaining = timeout - elapsed;
|
||||||
const remaining = timeout - elapsed;
|
const thisTimeout = remaining > 100 ? remaining : 100;
|
||||||
const thisTimeout = remaining > 100 ? remaining : 100;
|
const p = (Array.isArray(options.waitForSelector) ?
|
||||||
const p = (Array.isArray(options.waitForSelector) ?
|
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
||||||
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
||||||
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
.then(async () => {
|
||||||
.then(async () => {
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
screenshot = await page.screenshot();
|
||||||
|
pageshot = await page.screenshot({ fullPage: true });
|
||||||
|
if (snapshot) {
|
||||||
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
|
}
|
||||||
|
finalized = true;
|
||||||
|
})
|
||||||
|
.catch((err) => {
|
||||||
|
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
||||||
|
waitForPromise = undefined;
|
||||||
|
});
|
||||||
|
return p as any;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
let lastHTML = snapshot?.html;
|
||||||
|
while (true) {
|
||||||
|
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
||||||
|
if (waitForPromise) {
|
||||||
|
ckpt.push(waitForPromise);
|
||||||
|
}
|
||||||
|
if (options?.minIntervalMs) {
|
||||||
|
ckpt.push(delay(options.minIntervalMs));
|
||||||
|
}
|
||||||
|
let error;
|
||||||
|
await Promise.race(ckpt).catch((err) => {
|
||||||
|
if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||||
|
this.logger.warn(`Invalid domain or TLD encountered: ${err.message}`);
|
||||||
|
error = new AssertionFailureError({
|
||||||
|
message: `Invalid domain or TLD for ${url}: ${err.message}`,
|
||||||
|
cause: err,
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
error = err;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (finalized && !error) {
|
||||||
|
if (!snapshot && !screenshot) {
|
||||||
|
if (error) {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||||
|
}
|
||||||
|
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
pageshot = await page.screenshot({ fullPage: true });
|
pageshot = await page.screenshot({ fullPage: true });
|
||||||
if (snapshot) {
|
lastHTML = snapshot.html;
|
||||||
snapshot.childFrames = await pSubFrameSnapshots;
|
}
|
||||||
}
|
if (snapshot || screenshot) {
|
||||||
finalized = true;
|
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
||||||
})
|
}
|
||||||
.catch((err) => {
|
if (error) {
|
||||||
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
if (error instanceof AssertionFailureError &&
|
||||||
waitForPromise = undefined;
|
(error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||||
});
|
this.logger.warn(`Continuing despite Invalid domain or TLD: ${error.message}`);
|
||||||
return p as any;
|
yield {
|
||||||
});
|
title: '',
|
||||||
}
|
href: url,
|
||||||
|
html: '',
|
||||||
try {
|
text: '',
|
||||||
let lastHTML = snapshot?.html;
|
screenshot,
|
||||||
while (true) {
|
pageshot,
|
||||||
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
error: 'Invalid domain or TLD'
|
||||||
if (waitForPromise) {
|
} as PageSnapshot;
|
||||||
ckpt.push(waitForPromise);
|
break;
|
||||||
}
|
} else {
|
||||||
if (options?.minIntervalMs) {
|
|
||||||
ckpt.push(delay(options.minIntervalMs));
|
|
||||||
}
|
|
||||||
let error;
|
|
||||||
await Promise.race(ckpt).catch((err) => error = err);
|
|
||||||
if (finalized && !error) {
|
|
||||||
if (!snapshot && !screenshot) {
|
|
||||||
if (error) {
|
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
|
||||||
}
|
}
|
||||||
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
} catch (error: any) {
|
||||||
screenshot = await page.screenshot();
|
if (error.message && (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) {
|
||||||
pageshot = await page.screenshot({ fullPage: true });
|
this.logger.warn(`Invalid domain or TLD encountered: ${error.message}`);
|
||||||
lastHTML = snapshot.html;
|
yield {
|
||||||
}
|
title: '',
|
||||||
if (snapshot || screenshot) {
|
href: url,
|
||||||
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
html: '',
|
||||||
}
|
text: '',
|
||||||
if (error) {
|
screenshot,
|
||||||
|
pageshot,
|
||||||
|
error: 'Invalid domain or TLD'
|
||||||
|
} as PageSnapshot;
|
||||||
|
} else {
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
|
} finally {
|
||||||
|
if (typeof waitForPromise !== 'undefined' && typeof gotoPromise !== 'undefined') {
|
||||||
|
Promise.allSettled([gotoPromise, waitForPromise]).finally(() => {
|
||||||
|
page.off('snapshot', hdl);
|
||||||
|
this.ditchPage(page);
|
||||||
|
});
|
||||||
|
} else if (typeof gotoPromise !== 'undefined') {
|
||||||
|
gotoPromise.finally(() => {
|
||||||
|
page.off('snapshot', hdl);
|
||||||
|
this.ditchPage(page);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
page.off('snapshot', hdl);
|
||||||
|
this.ditchPage(page);
|
||||||
|
}
|
||||||
|
nextSnapshotDeferred.resolve();
|
||||||
}
|
}
|
||||||
} finally {
|
} catch (error: any) {
|
||||||
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
this.logger.error(`Unhandled error in scrap method:`, error);
|
||||||
page.off('snapshot', hdl);
|
yield {
|
||||||
this.ditchPage(page);
|
title: 'Error: Unhandled exception',
|
||||||
});
|
href: url,
|
||||||
nextSnapshotDeferred.resolve();
|
html: '',
|
||||||
|
text: `An unexpected error occurred: ${error.message}`,
|
||||||
|
error: 'Unhandled exception'
|
||||||
|
} as PageSnapshot;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user