feat: bring your own html

This commit is contained in:
Yanlong Wang 2024-07-25 16:54:28 +08:00
parent 78ea13b101
commit 1c4b64fe04
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 34 additions and 14 deletions

View File

@@ -686,7 +686,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
rpcReflect.return(sseStream);
try {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
if (!scrapped) {
continue;
}
@@ -713,7 +713,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
let lastScrapped;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
@@ -737,7 +737,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return formatted;
}
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
@@ -880,8 +880,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return r;
}
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
if (crawlerOpts?.html) {
const fakeSnapshot = {
href: urlToCrawl.toString(),
html: crawlerOpts.html,
title: '',
text: '',
} as PageSnapshot;
yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts);
return;
}
let cache;
const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs;
if (cacheTolerance && !crawlOpts?.cookies?.length) {
cache = await this.queryCache(urlToCrawl, cacheTolerance);
}
@@ -934,8 +948,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

View File

@@ -142,6 +142,8 @@ export class SearcherHost extends RPCHost {
});
}
delete crawlerOptions.html;
const crawlOpts = this.crawler.configure(crawlerOptions);
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
@@ -171,7 +173,7 @@ export class SearcherHost extends RPCHost {
}
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs }
);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@@ -308,13 +310,13 @@ export class SearcherHost extends RPCHost {
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
searchResults?: WebSearchResult[],
options?: ExtraScrappingOptions,
pageCacheTolerance?: number
crawlerOptions?: CrawlerOptions,
) {
if (!searchResults) {
return;
}
const urls = searchResults.map((x) => new URL(x.url));
for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
const mapped = scrapped.map((x, i) => {
const upstreamSearchResult = searchResults[i];
if (!x || (!x.parsed && mode !== 'markdown')) {

View File

@@ -119,6 +119,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
url?: string;
@Prop()
html?: string;
@Prop({
default: 'default',
})

View File

@@ -653,7 +653,7 @@ document.addEventListener('load', handlePageLoad);
targetSelector?: string | string[];
removeSelector?: string | string[];
}): PageSnapshot | undefined {
if (!options?.targetSelector && !options?.removeSelector) {
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
return snapshot;
}
if (!snapshot?.html) {
@@ -663,15 +663,15 @@ document.addEventListener('load', handlePageLoad);
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const allNodes: Node[] = [];
if (Array.isArray(options.removeSelector)) {
if (Array.isArray(options?.removeSelector)) {
for (const rl of options.removeSelector) {
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
}
} else if (options.removeSelector) {
} else if (options?.removeSelector) {
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
}
if (Array.isArray(options.targetSelector)) {
if (Array.isArray(options?.targetSelector)) {
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
x.forEach((el) => {
if (!allNodes.includes(el)) {
@@ -679,7 +679,7 @@ document.addEventListener('load', handlePageLoad);
}
});
}
} else if (options.targetSelector) {
} else if (options?.targetSelector) {
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
if (!allNodes.includes(el)) {
allNodes.push(el);
@@ -738,6 +738,7 @@ document.addEventListener('load', handlePageLoad);
const r = {
...snapshot,
title: snapshot.title || jsdom.window.document.title,
parsed,
html: rootDoc.documentElement.outerHTML,
text: cleanedText,