From 9c475d63b7310c6b607b63336470992c78089bf9 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 6 Sep 2024 08:50:42 -0300 Subject: [PATCH] feat: added fetch sitemap for yoastSEO generated sitemaps (example omnyhealth.com previously had 35 results, now 131). --- apps/api/src/scraper/WebScraper/crawler.ts | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index d5dadaf8..29d313f5 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -156,6 +156,13 @@ export class WebCrawler { if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); return filteredLinks.map(link => ({ url: link, html: "" })); + } else { + // Yoast SEO sitemap index + let sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl, "/sitemap_index.xml"); + if (sitemapLinks.length > 0) { + let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); + return filteredLinks.map(link => ({ url: link, html: "" })); + } } return null; } @@ -498,7 +505,10 @@ export class WebCrawler { } // - private async tryFetchSitemapLinks(url: string): Promise { + private async tryFetchSitemapLinks(url: string, sitemapPath?: string): Promise { + if (!sitemapPath) { + sitemapPath = "/sitemap.xml"; + } const normalizeUrl = (url: string) => { url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); if (url.endsWith("/")) { @@ -507,9 +517,9 @@ export class WebCrawler { return url; }; - const sitemapUrl = url.endsWith("/sitemap.xml") + const sitemapUrl = url.endsWith(sitemapPath) ? url - : `${url}/sitemap.xml`; + : `${url}${sitemapPath}`; let sitemapLinks: string[] = []; @@ -531,7 +541,7 @@ export class WebCrawler { } if (sitemapLinks.length === 0) { - const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; + const baseUrlSitemap = `${this.baseUrl}${sitemapPath}`; try { const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); if (response.status === 200) {