diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 80705dbd..f97230ff 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -383,7 +383,7 @@ export class WebCrawler { return linkDomain === baseDomain; } - private isFile(url: string): boolean { + public isFile(url: string): boolean { const fileExtensions = [ ".png", ".jpg", diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c9368f41..460aeca6 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -2,6 +2,7 @@ import axios from "axios"; import { axiosTimeout } from "../../lib/timeout"; import { parseStringPromise } from "xml2js"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; +import { WebCrawler } from "./crawler"; export async function getLinksFromSitemap( { @@ -41,7 +42,7 @@ export async function getLinksFromSitemap( } } else if (root && root.url) { for (const url of root.url) { - if (url.loc && url.loc.length > 0) { + if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) { allUrls.push(url.loc[0]); } }