From 1c4b64fe04aec11511df9df55a10ef59ee6112d9 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 25 Jul 2024 16:54:28 +0800 Subject: [PATCH] feat: bring your own html --- .../functions/src/cloud-functions/crawler.ts | 26 ++++++++++++++----- .../functions/src/cloud-functions/searcher.ts | 8 +++--- .../functions/src/dto/scrapping-options.ts | 3 +++ backend/functions/src/services/puppeteer.ts | 11 ++++---- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index cd1014b..76ad7d5 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -686,7 +686,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; rpcReflect.return(sseStream); try { - for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) { + for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) { if (!scrapped) { continue; } @@ -713,7 +713,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; let lastScrapped; if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { - for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) { + for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) { continue; @@ -737,7 +737,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return formatted; } - for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) { + for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) { continue; @@ -880,8 +880,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return r; } - async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) { + async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) { + if (crawlerOpts?.html) { + const fakeSnapshot = { + href: urlToCrawl.toString(), + html: crawlerOpts.html, + title: '', + text: '', + } as PageSnapshot; + + yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts); + + return; + } let cache; + + const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs; if (cacheTolerance && !crawlOpts?.cookies?.length) { cache = await this.queryCache(urlToCrawl, cacheTolerance); } @@ -934,8 +948,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } - async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) { - const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance)); + async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) { + const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts)); const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined); diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index 031a4a8..1c696f1 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -142,6 +142,8 @@ export class SearcherHost extends RPCHost { }); } + delete crawlerOptions.html; + const crawlOpts = this.crawler.configure(crawlerOptions); const cookies: CookieParam[] = []; const setCookieHeaders = ctx.req.headers['x-set-cookie']; @@ -171,7 +173,7 @@ export class SearcherHost extends RPCHost { } const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts, - crawlerOptions.cacheTolerance || this.pageCacheToleranceMs + { ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs } ); if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { @@ -308,13 +310,13 @@ export class SearcherHost extends RPCHost { mode: string | 'markdown' | 'html' | 'text' | 'screenshot', searchResults?: WebSearchResult[], options?: ExtraScrappingOptions, - pageCacheTolerance?: number + crawlerOptions?: CrawlerOptions, ) { if (!searchResults) { return; } const urls = searchResults.map((x) => new URL(x.url)); - for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) { + for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) { const mapped = scrapped.map((x, i) => { const upstreamSearchResult = searchResults[i]; if (!x || (!x.parsed && mode !== 'markdown')) { diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 3405927..464ebc2 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -119,6 +119,9 @@ export class CrawlerOptions extends AutoCastable { @Prop() url?: string; + @Prop() + html?: string; + @Prop({ default: 'default', }) diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index ce69902..94279ef 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -653,7 +653,7 @@ document.addEventListener('load', handlePageLoad); targetSelector?: string | string[]; removeSelector?: string | string[]; }): PageSnapshot | undefined { - if (!options?.targetSelector && !options?.removeSelector) { + if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) { return snapshot; } if (!snapshot?.html) { @@ -663,15 +663,15 @@ document.addEventListener('load', handlePageLoad); const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); const allNodes: Node[] = []; - if (Array.isArray(options.removeSelector)) { + if (Array.isArray(options?.removeSelector)) { for (const rl of options.removeSelector) { jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove()); } - } else if (options.removeSelector) { + } else if (options?.removeSelector) { jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove()); } - if (Array.isArray(options.targetSelector)) { + if (Array.isArray(options?.targetSelector)) { for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) { x.forEach((el) => { if (!allNodes.includes(el)) { @@ -679,7 +679,7 @@ document.addEventListener('load', handlePageLoad); } }); } - } else if (options.targetSelector) { + } else if (options?.targetSelector) { jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => { if (!allNodes.includes(el)) { allNodes.push(el); @@ -738,6 +738,7 @@ document.addEventListener('load', handlePageLoad); const r = { ...snapshot, + title: snapshot.title || jsdom.window.document.title, parsed, html: rootDoc.documentElement.outerHTML, text: cleanedText,