fix(crawler): relative URL handling on non-start pages (#893)

* fix(crawler): relative URL handling on non-start pages

* fix(crawl): further fixing
This commit is contained in:
Gergő Móricz 2024-11-12 18:20:53 +01:00 committed by GitHub
parent 740a429790
commit fbabc779f5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 5 additions and 5 deletions

View File

@@ -166,10 +166,10 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
return res;
}
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
initialUrl: sc.originUrl!,
initialUrl: initialUrl ?? sc.originUrl!,
includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,

View File

@@ -171,7 +171,7 @@ export class WebCrawler {
let fullUrl = href;
if (!href.startsWith("http")) {
try {
fullUrl = new URL(href, this.baseUrl).toString();
fullUrl = new URL(href, url).toString();
} catch (_) {
return null;
}

View File

@@ -352,10 +352,10 @@ async function processJob(job: Job & { id: string }, token: string) {
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl as string),
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
Infinity,
sc.crawlerOptions?.maxDepth ?? 10
);