mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-15 19:22:19 +08:00
fix(crawl/maxDepth): fix maxDepth behaviour
This commit is contained in:
parent
7d576d13bf
commit
68c9615f2d
|
@@ -3,6 +3,7 @@ import { ScrapeOptions } from "../controllers/v1/types";
|
|||
import { WebCrawler } from "../scraper/WebScraper/crawler";
|
||||
import { redisConnection } from "../services/queue-service";
|
||||
import { logger } from "./logger";
|
||||
import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl?: string;
|
||||
|
@@ -172,7 +173,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
|
|||
includes: sc.crawlerOptions?.includes ?? [],
|
||||
excludes: sc.crawlerOptions?.excludes ?? [],
|
||||
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
|
||||
maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
|
||||
maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10),
|
||||
limit: sc.crawlerOptions?.limit ?? 10000,
|
||||
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
|
||||
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
|
||||
|
|
|
@@ -7,6 +7,6 @@ export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number
|
|||
}
|
||||
|
||||
export function getURLDepth(url: string): number {
|
||||
const pathSplits = new URL(url).pathname.split('/');
|
||||
return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
|
||||
const pathSplits = new URL(url).pathname.split('/').filter(x => x !== "" && x !== "index.php" && x !== "index.html");
|
||||
return pathSplits.length;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user