fix(crawl): redirect rebase
Some checks are pending
Deploy Images to GHCR / push-app-image (push) Waiting to run

This commit is contained in:
Gergő Móricz 2024-11-13 21:38:44 +01:00
parent 0d1c4e4e09
commit 0310cd2afa
3 changed files with 6 additions and 3 deletions

View File

@@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
   return res;
 }
 
-export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
+export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
     initialUrl: sc.originUrl!,
+    baseUrl: newBase ? new URL(newBase).origin : undefined,
     includes: sc.crawlerOptions?.includes ?? [],
     excludes: sc.crawlerOptions?.excludes ?? [],
     maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,

View File

@@ -27,6 +27,7 @@ export class WebCrawler {
   constructor({
     jobId,
     initialUrl,
+    baseUrl,
     includes,
     excludes,
     maxCrawledLinks = 10000,
@@ -38,6 +39,7 @@ export class WebCrawler {
   }: {
     jobId: string;
     initialUrl: string;
+    baseUrl?: string;
     includes?: string[];
     excludes?: string[];
     maxCrawledLinks?: number;
@@ -49,7 +51,7 @@ export class WebCrawler {
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
-    this.baseUrl = new URL(initialUrl).origin;
+    this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;

View File

@@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
       if (!sc.cancelled) {
-        const crawler = crawlToCrawler(job.data.crawl_id, sc);
+        const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
         const links = crawler.filterLinks(
           crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),