diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index ddc542f9..9f6a8cf4 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -3,6 +3,7 @@ import { ScrapeOptions } from "../controllers/v1/types";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import { redisConnection } from "../services/queue-service";
 import { logger } from "./logger";
+import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
 
 export type StoredCrawl = {
   originUrl?: string;
@@ -172,7 +173,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
     includes: sc.crawlerOptions?.includes ?? [],
     excludes: sc.crawlerOptions?.excludes ?? [],
     maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
-    maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
+    maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10),
     limit: sc.crawlerOptions?.limit ?? 10000,
     generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
diff --git a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts
index c1fea7fc..bcacc210 100644
--- a/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts
+++ b/apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts
@@ -7,6 +7,6 @@ export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number
 }
 
 export function getURLDepth(url: string): number {
-  const pathSplits = new URL(url).pathname.split('/');
-  return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
+  const pathSplits = new URL(url).pathname.split('/').filter(x => x !== "" && x !== "index.php" && x !== "index.html");
+  return pathSplits.length;
 }
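
For reference, a minimal sketch of how the updated `getURLDepth` behaves (not part of the diff; the URLs are illustrative and the import path assumes the snippet sits next to `maxDepthUtils.ts`):

```typescript
import { getURLDepth } from "./maxDepthUtils";

// Root paths and index files no longer count as path segments.
console.log(getURLDepth("https://example.com"));            // 0
console.log(getURLDepth("https://example.com/index.html")); // 0 (the old implementation returned 1 here)
console.log(getURLDepth("https://example.com/blog/post"));  // 2 - two real path segments

// crawlToCrawler now routes the crawl's origin URL through getAdjustedMaxDepth,
// so maxCrawledDepth is interpreted relative to where the crawl starts rather
// than relative to the domain root.
```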