From 5ce4aaf0ec534d9b359a1a6e8e7c7229742212cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 23:35:07 +0100 Subject: [PATCH] fix(crawl): initialURL setting is unnecessary --- apps/api/src/lib/crawl-redis.ts | 4 ++-- apps/api/src/services/queue-worker.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index bd79a86d..b5936ad6 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -166,10 +166,10 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro return res; } -export function crawlToCrawler(id: string, sc: StoredCrawl, initialUrl?: string): WebCrawler { +export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { const crawler = new WebCrawler({ jobId: id, - initialUrl: initialUrl ?? sc.originUrl!, + initialUrl: sc.originUrl!, includes: sc.crawlerOptions?.includes ?? [], excludes: sc.crawlerOptions?.excludes ?? [], maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 3ea976d6..831dec6b 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) { if (!job.data.sitemapped && job.data.crawlerOptions !== null) { if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata?.url ?? doc.metadata?.sourceURL ?? undefined); + const crawler = crawlToCrawler(job.data.crawl_id, sc); const links = crawler.filterLinks( crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),