mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 03:32:25 +08:00
feat: bring your own html
This commit is contained in:
parent
78ea13b101
commit
1c4b64fe04
|
@ -686,7 +686,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
rpcReflect.return(sseStream);
|
||||
|
||||
try {
|
||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
||||
if (!scrapped) {
|
||||
continue;
|
||||
}
|
||||
|
@ -713,7 +713,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
|
||||
let lastScrapped;
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
continue;
|
||||
|
@ -737,7 +737,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
return formatted;
|
||||
}
|
||||
|
||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions.cacheTolerance)) {
|
||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
continue;
|
||||
|
@ -880,8 +880,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
return r;
|
||||
}
|
||||
|
||||
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, cacheTolerance: number = this.cacheValidMs) {
|
||||
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
||||
if (crawlerOpts?.html) {
|
||||
const fakeSnapshot = {
|
||||
href: urlToCrawl.toString(),
|
||||
html: crawlerOpts.html,
|
||||
title: '',
|
||||
text: '',
|
||||
} as PageSnapshot;
|
||||
|
||||
yield this.puppeteerControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
||||
|
||||
return;
|
||||
}
|
||||
let cache;
|
||||
|
||||
const cacheTolerance = crawlerOpts?.cacheTolerance || this.cacheValidMs;
|
||||
if (cacheTolerance && !crawlOpts?.cookies?.length) {
|
||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
||||
}
|
||||
|
@ -934,8 +948,8 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||
}
|
||||
|
||||
|
||||
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, cacheTolerance?: number) {
|
||||
const iterators = urls.map((url) => this.cachedScrap(url, options, cacheTolerance));
|
||||
async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
||||
const iterators = urls.map((url) => this.cachedScrap(url, options, crawlerOpts));
|
||||
|
||||
const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);
|
||||
|
||||
|
|
|
@ -142,6 +142,8 @@ export class SearcherHost extends RPCHost {
|
|||
});
|
||||
}
|
||||
|
||||
delete crawlerOptions.html;
|
||||
|
||||
const crawlOpts = this.crawler.configure(crawlerOptions);
|
||||
const cookies: CookieParam[] = [];
|
||||
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
|
||||
|
@ -171,7 +173,7 @@ export class SearcherHost extends RPCHost {
|
|||
}
|
||||
|
||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
|
||||
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
|
||||
{ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance || this.pageCacheToleranceMs }
|
||||
);
|
||||
|
||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||
|
@ -308,13 +310,13 @@ export class SearcherHost extends RPCHost {
|
|||
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
||||
searchResults?: WebSearchResult[],
|
||||
options?: ExtraScrappingOptions,
|
||||
pageCacheTolerance?: number
|
||||
crawlerOptions?: CrawlerOptions,
|
||||
) {
|
||||
if (!searchResults) {
|
||||
return;
|
||||
}
|
||||
const urls = searchResults.map((x) => new URL(x.url));
|
||||
for await (const scrapped of this.crawler.scrapMany(urls, options, pageCacheTolerance)) {
|
||||
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
||||
const mapped = scrapped.map((x, i) => {
|
||||
const upstreamSearchResult = searchResults[i];
|
||||
if (!x || (!x.parsed && mode !== 'markdown')) {
|
||||
|
|
|
@ -119,6 +119,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||
@Prop()
|
||||
url?: string;
|
||||
|
||||
@Prop()
|
||||
html?: string;
|
||||
|
||||
@Prop({
|
||||
default: 'default',
|
||||
})
|
||||
|
|
|
@ -653,7 +653,7 @@ document.addEventListener('load', handlePageLoad);
|
|||
targetSelector?: string | string[];
|
||||
removeSelector?: string | string[];
|
||||
}): PageSnapshot | undefined {
|
||||
if (!options?.targetSelector && !options?.removeSelector) {
|
||||
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
||||
return snapshot;
|
||||
}
|
||||
if (!snapshot?.html) {
|
||||
|
@ -663,15 +663,15 @@ document.addEventListener('load', handlePageLoad);
|
|||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||
const allNodes: Node[] = [];
|
||||
|
||||
if (Array.isArray(options.removeSelector)) {
|
||||
if (Array.isArray(options?.removeSelector)) {
|
||||
for (const rl of options.removeSelector) {
|
||||
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
||||
}
|
||||
} else if (options.removeSelector) {
|
||||
} else if (options?.removeSelector) {
|
||||
jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove());
|
||||
}
|
||||
|
||||
if (Array.isArray(options.targetSelector)) {
|
||||
if (Array.isArray(options?.targetSelector)) {
|
||||
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
||||
x.forEach((el) => {
|
||||
if (!allNodes.includes(el)) {
|
||||
|
@ -679,7 +679,7 @@ document.addEventListener('load', handlePageLoad);
|
|||
}
|
||||
});
|
||||
}
|
||||
} else if (options.targetSelector) {
|
||||
} else if (options?.targetSelector) {
|
||||
jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => {
|
||||
if (!allNodes.includes(el)) {
|
||||
allNodes.push(el);
|
||||
|
@ -738,6 +738,7 @@ document.addEventListener('load', handlePageLoad);
|
|||
|
||||
const r = {
|
||||
...snapshot,
|
||||
title: snapshot.title || jsdom.window.document.title,
|
||||
parsed,
|
||||
html: rootDoc.documentElement.outerHTML,
|
||||
text: cleanedText,
|
||||
|
|
Loading…
Reference in New Issue
Block a user