From 871d5d91b0fa0d477425bbdc512edcbaa4a33f56 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 17 Apr 2024 12:51:12 -0700 Subject: [PATCH] Update index.ts --- apps/api/src/scraper/WebScraper/index.ts | 284 +++++++++++++++-------- 1 file changed, 190 insertions(+), 94 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 727b5973..ecb2fff6 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -49,19 +49,21 @@ export class WebScraperDataProvider { const results: (Document | null)[] = new Array(urls.length).fill(null); for (let i = 0; i < urls.length; i += this.concurrentRequests) { const batchUrls = urls.slice(i, i + this.concurrentRequests); - await Promise.all(batchUrls.map(async (url, index) => { - const result = await scrapSingleUrl(url, true); - processedUrls++; - if (inProgress) { - inProgress({ - current: processedUrls, - total: totalUrls, - status: "SCRAPING", - currentDocumentUrl: url, - }); - } - results[i + index] = result; - })); + await Promise.all( + batchUrls.map(async (url, index) => { + const result = await scrapSingleUrl(url, true); + processedUrls++; + if (inProgress) { + inProgress({ + current: processedUrls, + total: totalUrls, + status: "SCRAPING", + currentDocumentUrl: url, + }); + } + results[i + index] = result; + }) + ); } return results.filter((result) => result !== null) as Document[]; } @@ -102,33 +104,58 @@ export class WebScraperDataProvider { // CACHING DOCUMENTS // - parent document - const cachedParentDocumentString = await getValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0])); + const cachedParentDocumentString = await getValue( + "web-scraper-cache:" + this.normalizeUrl(this.urls[0]) + ); if (cachedParentDocumentString != null) { let cachedParentDocument = JSON.parse(cachedParentDocumentString); - if (!cachedParentDocument.childrenLinks || cachedParentDocument.childrenLinks.length < links.length - 1) { - cachedParentDocument.childrenLinks = links.filter((link) => link !== this.urls[0]); - await setValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]), JSON.stringify(cachedParentDocument), 60 * 60 * 24 * 10); // 10 days + if ( + !cachedParentDocument.childrenLinks || + cachedParentDocument.childrenLinks.length < links.length - 1 + ) { + cachedParentDocument.childrenLinks = links.filter( + (link) => link !== this.urls[0] + ); + await setValue( + "web-scraper-cache:" + this.normalizeUrl(this.urls[0]), + JSON.stringify(cachedParentDocument), + 60 * 60 * 24 * 10 + ); // 10 days } } else { - let parentDocument = documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) === this.normalizeUrl(this.urls[0])) + let parentDocument = documents.filter( + (document) => + this.normalizeUrl(document.metadata.sourceURL) === + this.normalizeUrl(this.urls[0]) + ); await this.setCachedDocuments(parentDocument, links); } - await this.setCachedDocuments(documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) !== this.normalizeUrl(this.urls[0])), []); + await this.setCachedDocuments( + documents.filter( + (document) => + this.normalizeUrl(document.metadata.sourceURL) !== + this.normalizeUrl(this.urls[0]) + ), + [] + ); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); return documents; } if (this.mode === "single_urls") { - let documents = await this.convertUrlsToDocuments(this.urls, inProgress); + let documents = await this.convertUrlsToDocuments( + this.urls, + inProgress + ); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } const baseUrl = new URL(this.urls[0]).origin; documents = await this.getSitemapData(baseUrl, documents); - + await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); @@ -136,14 +163,17 @@ export class WebScraperDataProvider { } if (this.mode === "sitemap") { const links = await getLinksFromSitemap(this.urls[0]); - let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); + let documents = await this.convertUrlsToDocuments( + links.slice(0, this.limit), + inProgress + ); documents = await this.getSitemapData(this.urls[0], documents); documents = this.replaceImgPathsWithAbsolutePaths(documents); if (this.generateImgAltText) { documents = await this.generatesImgAltText(documents); } - + await this.setCachedDocuments(documents); documents = this.removeChildLinks(documents); documents = documents.splice(0, this.limit); @@ -153,11 +183,22 @@ export class WebScraperDataProvider { return []; } - let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit)); + let documents = await this.getCachedDocuments( + this.urls.slice(0, this.limit) + ); if (documents.length < this.limit) { - const newDocuments: Document[] = await this.getDocuments(false, inProgress); - newDocuments.forEach(doc => { - if (!documents.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) { + const newDocuments: Document[] = await this.getDocuments( + false, + inProgress + ); + newDocuments.forEach((doc) => { + if ( + !documents.some( + (d) => + this.normalizeUrl(d.metadata.sourceURL) === + this.normalizeUrl(doc.metadata?.sourceURL) + ) + ) { documents.push(doc); } }); @@ -173,17 +214,23 @@ export class WebScraperDataProvider { const url = new URL(document.metadata.sourceURL); const path = url.pathname; - if (this.excludes.length > 0 && this.excludes[0] !== '') { + if (this.excludes.length > 0 && this.excludes[0] !== "") { // Check if the link should be excluded - if (this.excludes.some(excludePattern => new RegExp(excludePattern).test(path))) { + if ( + this.excludes.some((excludePattern) => + new RegExp(excludePattern).test(path) + ) + ) { return false; } } - - if (this.includes.length > 0 && this.includes[0] !== '') { + + if (this.includes.length > 0 && this.includes[0] !== "") { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0) { - return this.includes.some(includePattern => new RegExp(includePattern).test(path)); + return this.includes.some((includePattern) => + new RegExp(includePattern).test(path) + ); } } return true; @@ -200,7 +247,7 @@ export class WebScraperDataProvider { private removeChildLinks(documents: Document[]): Document[] { for (let document of documents) { if (document?.childrenLinks) delete document.childrenLinks; - }; + } return documents; } @@ -210,10 +257,14 @@ export class WebScraperDataProvider { continue; } const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL); - await setValue('web-scraper-cache:' + normalizedUrl, JSON.stringify({ - ...document, - childrenLinks: childrenLinks || [] - }), 60 * 60 * 24 * 10); // 10 days + await setValue( + "web-scraper-cache:" + normalizedUrl, + JSON.stringify({ + ...document, + childrenLinks: childrenLinks || [], + }), + 60 * 60 * 24 * 10 + ); // 10 days } } @@ -221,8 +272,12 @@ export class WebScraperDataProvider { let documents: Document[] = []; for (const url of urls) { const normalizedUrl = this.normalizeUrl(url); - console.log("Getting cached document for web-scraper-cache:" + normalizedUrl) - const cachedDocumentString = await getValue('web-scraper-cache:' + normalizedUrl); + console.log( + "Getting cached document for web-scraper-cache:" + normalizedUrl + ); + const cachedDocumentString = await getValue( + "web-scraper-cache:" + normalizedUrl + ); if (cachedDocumentString) { const cachedDocument = JSON.parse(cachedDocumentString); documents.push(cachedDocument); @@ -230,10 +285,18 @@ export class WebScraperDataProvider { // get children documents for (const childUrl of cachedDocument.childrenLinks) { const normalizedChildUrl = this.normalizeUrl(childUrl); - const childCachedDocumentString = await getValue('web-scraper-cache:' + normalizedChildUrl); + const childCachedDocumentString = await getValue( + "web-scraper-cache:" + normalizedChildUrl + ); if (childCachedDocumentString) { const childCachedDocument = JSON.parse(childCachedDocumentString); - if (!documents.find((doc) => doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL)) { + if ( + !documents.find( + (doc) => + doc.metadata.sourceURL === + childCachedDocument.metadata.sourceURL + ) + ) { documents.push(childCachedDocument); } } @@ -248,7 +311,7 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } - console.log("options", options.crawlerOptions?.excludes) + console.log("options", options.crawlerOptions?.excludes); this.urls = options.urls; this.mode = options.mode; this.concurrentRequests = options.concurrentRequests ?? 20; @@ -257,13 +320,12 @@ export class WebScraperDataProvider { this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; this.limit = options.crawlerOptions?.limit ?? 10000; - this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false; - + this.generateImgAltText = + options.crawlerOptions?.generateImgAltText ?? false; //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check - this.excludes = this.excludes.filter(item => item !== ''); - - + this.excludes = this.excludes.filter((item) => item !== ""); + // make sure all urls start with https:// this.urls = this.urls.map((url) => { if (!url.trim().startsWith("http")) { @@ -274,10 +336,14 @@ export class WebScraperDataProvider { } private async getSitemapData(baseUrl: string, documents: Document[]) { - const sitemapData = await fetchSitemapData(baseUrl) + const sitemapData = await fetchSitemapData(baseUrl); if (sitemapData) { for (let i = 0; i < documents.length; i++) { - const docInSitemapData = sitemapData.find((data) => this.normalizeUrl(data.loc) === this.normalizeUrl(documents[i].metadata.sourceURL)) + const docInSitemapData = sitemapData.find( + (data) => + this.normalizeUrl(data.loc) === + this.normalizeUrl(documents[i].metadata.sourceURL) + ); if (docInSitemapData) { let sitemapDocData: Partial = {}; if (docInSitemapData.changefreq) { @@ -298,52 +364,82 @@ export class WebScraperDataProvider { return documents; } generatesImgAltText = async (documents: Document[]): Promise => { - await Promise.all(documents.map(async (document) => { - const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; + await Promise.all( + documents.map(async (document) => { + const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; - await Promise.all(images.map(async (image: string) => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; + await Promise.all( + images.map(async (image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; - if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) { - const imageIndex = document.content.indexOf(image); - const contentLength = document.content.length; - let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength)); - let frontTextStartIndex = Math.max(imageIndex - 1000, 0); - let frontText = document.content.substring(frontTextStartIndex, imageIndex); - altText = await getImageDescription(imageUrl, backText, frontText); - } - - document.content = document.content.replace(image, `![${altText}](${imageUrl})`); - })); - })); - - return documents; - } - - replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { - documents.forEach(document => { - const baseUrl = new URL(document.metadata.sourceURL).origin; - const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || []; - - images.forEach(image => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; - - if (!imageUrl.startsWith("data:image")) { - if (!imageUrl.startsWith("http")) { - if (imageUrl.startsWith("/")) { - imageUrl = imageUrl.substring(1); + if ( + !altText && + !imageUrl.startsWith("data:image") && + /\.(png|jpeg|gif|webp)$/.test(imageUrl) + ) { + const imageIndex = document.content.indexOf(image); + const contentLength = document.content.length; + let backText = document.content.substring( + imageIndex + image.length, + Math.min(imageIndex + image.length + 1000, contentLength) + ); + let frontTextStartIndex = Math.max(imageIndex - 1000, 0); + let frontText = document.content.substring( + frontTextStartIndex, + imageIndex + ); + altText = await getImageDescription( + imageUrl, + backText, + frontText + ); } - imageUrl = new URL(imageUrl, baseUrl).toString(); - } - } - - document.content = document.content.replace(image, `![${altText}](${imageUrl})`); - }); - }); - - return documents; - } -} + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }) + ); + }) + ); + + return documents; + }; + + replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { + try { + documents.forEach((document) => { + const baseUrl = new URL(document.metadata.sourceURL).origin; + const images = + document.content.match( + /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g + ) || []; + + images.forEach((image: string) => { + let imageUrl = image.match(/\(([^)]+)\)/)[1]; + let altText = image.match(/\[(.*?)\]/)[1]; + + if (!imageUrl.startsWith("data:image")) { + if (!imageUrl.startsWith("http")) { + if (imageUrl.startsWith("/")) { + imageUrl = imageUrl.substring(1); + } + imageUrl = new URL(imageUrl, baseUrl).toString(); + } + } + + document.content = document.content.replace( + image, + `![${altText}](${imageUrl})` + ); + }); + }); + + return documents; + } catch (error) { + return documents; + } + }; +}