mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-15 19:22:19 +08:00
fix(crawl): redirect rebase
Some checks are pending
Deploy Images to GHCR / push-app-image (push) Waiting to run
Some checks are pending
Deploy Images to GHCR / push-app-image (push) Waiting to run
This commit is contained in:
parent
0d1c4e4e09
commit
0310cd2afa
|
@ -166,10 +166,11 @@ export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Pro
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): WebCrawler {
|
||||||
const crawler = new WebCrawler({
|
const crawler = new WebCrawler({
|
||||||
jobId: id,
|
jobId: id,
|
||||||
initialUrl: sc.originUrl!,
|
initialUrl: sc.originUrl!,
|
||||||
|
baseUrl: newBase ? new URL(newBase).origin : undefined,
|
||||||
includes: sc.crawlerOptions?.includes ?? [],
|
includes: sc.crawlerOptions?.includes ?? [],
|
||||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||||
|
|
|
@ -27,6 +27,7 @@ export class WebCrawler {
|
||||||
constructor({
|
constructor({
|
||||||
jobId,
|
jobId,
|
||||||
initialUrl,
|
initialUrl,
|
||||||
|
baseUrl,
|
||||||
includes,
|
includes,
|
||||||
excludes,
|
excludes,
|
||||||
maxCrawledLinks = 10000,
|
maxCrawledLinks = 10000,
|
||||||
|
@ -38,6 +39,7 @@ export class WebCrawler {
|
||||||
}: {
|
}: {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
initialUrl: string;
|
initialUrl: string;
|
||||||
|
baseUrl?: string;
|
||||||
includes?: string[];
|
includes?: string[];
|
||||||
excludes?: string[];
|
excludes?: string[];
|
||||||
maxCrawledLinks?: number;
|
maxCrawledLinks?: number;
|
||||||
|
@ -49,7 +51,7 @@ export class WebCrawler {
|
||||||
}) {
|
}) {
|
||||||
this.jobId = jobId;
|
this.jobId = jobId;
|
||||||
this.initialUrl = initialUrl;
|
this.initialUrl = initialUrl;
|
||||||
this.baseUrl = new URL(initialUrl).origin;
|
this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
|
||||||
this.includes = Array.isArray(includes) ? includes : [];
|
this.includes = Array.isArray(includes) ? includes : [];
|
||||||
this.excludes = Array.isArray(excludes) ? excludes : [];
|
this.excludes = Array.isArray(excludes) ? excludes : [];
|
||||||
this.limit = limit;
|
this.limit = limit;
|
||||||
|
|
|
@ -352,7 +352,7 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||||
|
|
||||||
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
||||||
if (!sc.cancelled) {
|
if (!sc.cancelled) {
|
||||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
|
||||||
|
|
||||||
const links = crawler.filterLinks(
|
const links = crawler.filterLinks(
|
||||||
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user