diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index b5936ad6..2b255971 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
   return res;
 }
 
-export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
+export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
     initialUrl: sc.originUrl!,
+    baseUrl: newBase ? new URL(newBase).origin : undefined,
     includes: sc.crawlerOptions?.includes ?? [],
     excludes: sc.crawlerOptions?.excludes ?? [],
     maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index e5a25f37..7b4a97d9 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -27,6 +27,7 @@ export class WebCrawler {
   constructor({
     jobId,
     initialUrl,
+    baseUrl,
     includes,
     excludes,
     maxCrawledLinks = 10000,
@@ -38,6 +39,7 @@ export class WebCrawler {
   }: {
     jobId: string;
     initialUrl: string;
+    baseUrl?: string;
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
@@ -49,7 +51,7 @@ export class WebCrawler {
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
-    this.baseUrl = new URL(initialUrl).origin;
+    this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 831dec6b..33b2ca9a 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
       if (!sc.cancelled) {
-        const crawler = crawlToCrawler(job.data.crawl_id, sc);
+        const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
         const links = crawler.filterLinks(
           crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
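
The effect of the new `baseUrl` parameter is easiest to see at the `WebCrawler` constructor: the worker now passes the scraped document's final URL (`doc.metadata.url`), so same-origin link filtering can follow that URL's origin rather than the origin the crawl started from. A minimal sketch, assuming the diff above — the import path and option values here are illustrative, and the real constructor takes more options than shown:

```ts
import { WebCrawler } from "./apps/api/src/scraper/WebScraper/crawler";

// Without baseUrl, filtering is pinned to the initial URL's origin.
const pinned = new WebCrawler({
  jobId: "job-1",
  initialUrl: "http://example.com/docs",
});
// internally: this.baseUrl = "http://example.com"

// With baseUrl — e.g. the origin of the URL a scrape actually landed on —
// filtering follows that origin instead.
const redirected = new WebCrawler({
  jobId: "job-1",
  initialUrl: "http://example.com/docs",
  baseUrl: new URL("https://www.example.com/docs").origin,
});
// internally: this.baseUrl = "https://www.example.com"
```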