From 7f084c6c438158a052a7e0a4db53b7a591dc068d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 14 Nov 2024 17:44:32 -0500 Subject: [PATCH] Nick: --- apps/api/src/controllers/v1/map.ts | 4 ++-- apps/api/src/scraper/WebScraper/crawler.ts | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index af97c6f1..64e0025a 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -56,7 +56,7 @@ export async function mapController( // If sitemapOnly is true, only get links from sitemap if (req.body.sitemapOnly) { - const sitemap = await crawler.tryGetSitemap(); + const sitemap = await crawler.tryGetSitemap(true, true); if (sitemap !== null) { sitemap.forEach((x) => { links.push(x.url); @@ -100,7 +100,7 @@ export async function mapController( // Parallelize sitemap fetch with serper search const [sitemap, ...searchResults] = await Promise.all([ - req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), + req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true), ...(cachedResult ? [] : pagePromises), ]); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7b4a97d9..3fe53e4d 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -65,7 +65,12 @@ export class WebCrawler { this.allowExternalContentLinks = allowExternalContentLinks ?? false; } - public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { + public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] { + // If the initial URL is a sitemap.xml, skip filtering + if (this.initialUrl.endsWith('sitemap.xml') && fromMap) { + return sitemapLinks.slice(0, limit); + } + return sitemapLinks .filter((link) => { let url: URL; @@ -159,11 +164,14 @@ export class WebCrawler { this.robots = robotsParser(this.robotsTxtUrl, txt); } - public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { + public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> { logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); + if(fromMap && onlySitemap) { + return sitemapLinks.map(link => ({ url: link, html: "" })); + } if (sitemapLinks.length > 0) { - let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); + let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap); return filteredLinks.map(link => ({ url: link, html: "" })); } return null; @@ -353,6 +361,7 @@ export class WebCrawler { return url; }; + const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;