fix(crawl): redirect rebase

2024-11-16 03:32:22 +08:00 · 2024-11-13 21:38:44 +01:00 · 2024-11-13 21:38:44 +01:00 · 0310cd2afa
commit 0310cd2afa
parent 0d1c4e4e09
3 changed files with 6 additions and 3 deletions
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
    return res;
 }

-export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
+export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
    const crawler = new WebCrawler({
        jobId: id,
        initialUrl: sc.originUrl!,
+        baseUrl: newBase ? new URL(newBase).origin : undefined,
        includes: sc.crawlerOptions?.includes ?? [],
        excludes: sc.crawlerOptions?.excludes ?? [],
        maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -27,6 +27,7 @@ export class WebCrawler {
  constructor({
    jobId,
    initialUrl,
+    baseUrl,
    includes,
    excludes,
    maxCrawledLinks = 10000,
@@ -38,6 +39,7 @@ export class WebCrawler {
  }: {
    jobId: string;
    initialUrl: string;
+    baseUrl?: string;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
@@ -49,7 +51,7 @@ export class WebCrawler {
  }) {
    this.jobId = jobId;
    this.initialUrl = initialUrl;
-    this.baseUrl = new URL(initialUrl).origin;
+    this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
    this.includes = Array.isArray(includes) ? includes : [];
    this.excludes = Array.isArray(excludes) ? excludes : [];
    this.limit = limit;
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) {

      if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
        if (!sc.cancelled) {
-          const crawler = crawlToCrawler(job.data.crawl_id, sc);
+          const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);

          const links = crawler.filterLinks(
            crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),