fix(crawl/maxDepth): fix maxDepth behaviour

This commit is contained in:
Gergő Móricz 2024-11-11 22:02:17 +01:00
parent 7d576d13bf
commit 68c9615f2d
2 changed files with 4 additions and 3 deletions

View File

@ -3,6 +3,7 @@ import { ScrapeOptions } from "../controllers/v1/types";
import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";
import { logger } from "./logger";
import { getAdjustedMaxDepth } from "../scraper/WebScraper/utils/maxDepthUtils";
export type StoredCrawl = {
originUrl?: string;
@ -172,7 +173,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10,
maxCrawledDepth: getAdjustedMaxDepth(sc.originUrl!, sc.crawlerOptions?.maxDepth ?? 10),
limit: sc.crawlerOptions?.limit ?? 10000,
generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,

View File

@ -7,6 +7,6 @@ export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number
}
export function getURLDepth(url: string): number {
const pathSplits = new URL(url).pathname.split('/');
return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1;
const pathSplits = new URL(url).pathname.split('/').filter(x => x !== "" && x !== "index.php" && x !== "index.html");
return pathSplits.length;
}