Update index.ts

2024-11-16 03:32:22 +08:00 · 2024-04-17 12:51:12 -07:00 · 2024-04-17 12:51:12 -07:00 · 871d5d91b0
commit 871d5d91b0
parent 2eb81545fa
1 changed files with 190 additions and 94 deletions
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -49,19 +49,21 @@ export class WebScraperDataProvider {
    const results: (Document | null)[] = new Array(urls.length).fill(null);
    for (let i = 0; i < urls.length; i += this.concurrentRequests) {
      const batchUrls = urls.slice(i, i + this.concurrentRequests);
-      await Promise.all(batchUrls.map(async (url, index) => {
+      await Promise.all(
-        const result = await scrapSingleUrl(url, true);
+        batchUrls.map(async (url, index) => {
-        processedUrls++;
+          const result = await scrapSingleUrl(url, true);
-        if (inProgress) {
+          processedUrls++;
-          inProgress({
+          if (inProgress) {
-            current: processedUrls,
+            inProgress({
-            total: totalUrls,
+              current: processedUrls,
-            status: "SCRAPING",
+              total: totalUrls,
-            currentDocumentUrl: url,
+              status: "SCRAPING",
-          });
+              currentDocumentUrl: url,
-        }
+            });
-        results[i + index] = result;
+          }
-      }));
+          results[i + index] = result;
        })
      );
    }
    return results.filter((result) => result !== null) as Document[];
  }
@ -102,33 +104,58 @@ export class WebScraperDataProvider {
        // CACHING DOCUMENTS
        // - parent document
-        const cachedParentDocumentString = await getValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]));
+        const cachedParentDocumentString = await getValue(
          "web-scraper-cache:" + this.normalizeUrl(this.urls[0])
        );
        if (cachedParentDocumentString != null) {
          let cachedParentDocument = JSON.parse(cachedParentDocumentString);
-          if (!cachedParentDocument.childrenLinks || cachedParentDocument.childrenLinks.length < links.length - 1) {
+          if (
-            cachedParentDocument.childrenLinks = links.filter((link) => link !== this.urls[0]);
+            !cachedParentDocument.childrenLinks ||
-            await setValue('web-scraper-cache:' + this.normalizeUrl(this.urls[0]), JSON.stringify(cachedParentDocument), 60 * 60 * 24 * 10); // 10 days
+            cachedParentDocument.childrenLinks.length < links.length - 1
          ) {
            cachedParentDocument.childrenLinks = links.filter(
              (link) => link !== this.urls[0]
            );
            await setValue(
              "web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
              JSON.stringify(cachedParentDocument),
              60 * 60 * 24 * 10
            ); // 10 days
          }
        } else {
-          let parentDocument = documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) === this.normalizeUrl(this.urls[0]))
+          let parentDocument = documents.filter(
            (document) =>
              this.normalizeUrl(document.metadata.sourceURL) ===
              this.normalizeUrl(this.urls[0])
          );
          await this.setCachedDocuments(parentDocument, links);
        }
-        await this.setCachedDocuments(documents.filter((document) => this.normalizeUrl(document.metadata.sourceURL) !== this.normalizeUrl(this.urls[0])), []);
+        await this.setCachedDocuments(
          documents.filter(
            (document) =>
              this.normalizeUrl(document.metadata.sourceURL) !==
              this.normalizeUrl(this.urls[0])
          ),
          []
        );
        documents = this.removeChildLinks(documents);
        documents = documents.splice(0, this.limit);
        return documents;
      }
      if (this.mode === "single_urls") {
-        let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
+        let documents = await this.convertUrlsToDocuments(
          this.urls,
          inProgress
        );
        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
        const baseUrl = new URL(this.urls[0]).origin;
        documents = await this.getSitemapData(baseUrl, documents);
-        
+
        await this.setCachedDocuments(documents);
        documents = this.removeChildLinks(documents);
        documents = documents.splice(0, this.limit);
@ -136,14 +163,17 @@ export class WebScraperDataProvider {
      }
      if (this.mode === "sitemap") {
        const links = await getLinksFromSitemap(this.urls[0]);
-        let documents = await this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress);
+        let documents = await this.convertUrlsToDocuments(
          links.slice(0, this.limit),
          inProgress
        );
        documents = await this.getSitemapData(this.urls[0], documents);
        documents = this.replaceImgPathsWithAbsolutePaths(documents);
        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
-        
+
        await this.setCachedDocuments(documents);
        documents = this.removeChildLinks(documents);
        documents = documents.splice(0, this.limit);
@ -153,11 +183,22 @@ export class WebScraperDataProvider {
      return [];
    }
-    let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
+    let documents = await this.getCachedDocuments(
      this.urls.slice(0, this.limit)
    );
    if (documents.length < this.limit) {
-       const newDocuments: Document[] = await this.getDocuments(false, inProgress);
+      const newDocuments: Document[] = await this.getDocuments(
-      newDocuments.forEach(doc => {
+        false,
-        if (!documents.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {
+        inProgress
      );
      newDocuments.forEach((doc) => {
        if (
          !documents.some(
            (d) =>
              this.normalizeUrl(d.metadata.sourceURL) ===
              this.normalizeUrl(doc.metadata?.sourceURL)
          )
        ) {
          documents.push(doc);
        }
      });
@ -173,17 +214,23 @@ export class WebScraperDataProvider {
      const url = new URL(document.metadata.sourceURL);
      const path = url.pathname;
-      if (this.excludes.length > 0 && this.excludes[0] !== '') {
+      if (this.excludes.length > 0 && this.excludes[0] !== "") {
        // Check if the link should be excluded
-        if (this.excludes.some(excludePattern => new RegExp(excludePattern).test(path))) {
+        if (
          this.excludes.some((excludePattern) =>
            new RegExp(excludePattern).test(path)
          )
        ) {
          return false;
        }
      }
-      
+
-      if (this.includes.length > 0 && this.includes[0] !== '') {
+      if (this.includes.length > 0 && this.includes[0] !== "") {
        // Check if the link matches the include patterns, if any are specified
        if (this.includes.length > 0) {
-          return this.includes.some(includePattern => new RegExp(includePattern).test(path));
+          return this.includes.some((includePattern) =>
            new RegExp(includePattern).test(path)
          );
        }
      }
      return true;
@ -200,7 +247,7 @@ export class WebScraperDataProvider {
  private removeChildLinks(documents: Document[]): Document[] {
    for (let document of documents) {
      if (document?.childrenLinks) delete document.childrenLinks;
-    };
+    }
    return documents;
  }
@ -210,10 +257,14 @@ export class WebScraperDataProvider {
        continue;
      }
      const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL);
-      await setValue('web-scraper-cache:' + normalizedUrl, JSON.stringify({
+      await setValue(
-        ...document,
+        "web-scraper-cache:" + normalizedUrl,
-        childrenLinks: childrenLinks || []
+        JSON.stringify({
-      }), 60 * 60 * 24 * 10); // 10 days
+          ...document,
          childrenLinks: childrenLinks || [],
        }),
        60 * 60 * 24 * 10
      ); // 10 days
    }
  }
@ -221,8 +272,12 @@ export class WebScraperDataProvider {
    let documents: Document[] = [];
    for (const url of urls) {
      const normalizedUrl = this.normalizeUrl(url);
-      console.log("Getting cached document for web-scraper-cache:" + normalizedUrl)
+      console.log(
-      const cachedDocumentString = await getValue('web-scraper-cache:' + normalizedUrl);
+        "Getting cached document for web-scraper-cache:" + normalizedUrl
      );
      const cachedDocumentString = await getValue(
        "web-scraper-cache:" + normalizedUrl
      );
      if (cachedDocumentString) {
        const cachedDocument = JSON.parse(cachedDocumentString);
        documents.push(cachedDocument);
@ -230,10 +285,18 @@ export class WebScraperDataProvider {
        // get children documents
        for (const childUrl of cachedDocument.childrenLinks) {
          const normalizedChildUrl = this.normalizeUrl(childUrl);
-          const childCachedDocumentString = await getValue('web-scraper-cache:' + normalizedChildUrl);
+          const childCachedDocumentString = await getValue(
            "web-scraper-cache:" + normalizedChildUrl
          );
          if (childCachedDocumentString) {
            const childCachedDocument = JSON.parse(childCachedDocumentString);
-            if (!documents.find((doc) => doc.metadata.sourceURL === childCachedDocument.metadata.sourceURL)) {
+            if (
              !documents.find(
                (doc) =>
                  doc.metadata.sourceURL ===
                  childCachedDocument.metadata.sourceURL
              )
            ) {
              documents.push(childCachedDocument);
            }
          }
@ -248,7 +311,7 @@ export class WebScraperDataProvider {
      throw new Error("Urls are required");
    }
-    console.log("options", options.crawlerOptions?.excludes)
+    console.log("options", options.crawlerOptions?.excludes);
    this.urls = options.urls;
    this.mode = options.mode;
    this.concurrentRequests = options.concurrentRequests ?? 20;
@ -257,13 +320,12 @@ export class WebScraperDataProvider {
    this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
    this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
    this.limit = options.crawlerOptions?.limit ?? 10000;
-    this.generateImgAltText = options.crawlerOptions?.generateImgAltText ?? false;
+    this.generateImgAltText =
-
+      options.crawlerOptions?.generateImgAltText ?? false;
    //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
-    this.excludes = this.excludes.filter(item => item !== '');
+    this.excludes = this.excludes.filter((item) => item !== "");
-  
+
    // make sure all urls start with https://
    this.urls = this.urls.map((url) => {
      if (!url.trim().startsWith("http")) {
@ -274,10 +336,14 @@ export class WebScraperDataProvider {
  }
  private async getSitemapData(baseUrl: string, documents: Document[]) {
-    const sitemapData = await fetchSitemapData(baseUrl)
+    const sitemapData = await fetchSitemapData(baseUrl);
    if (sitemapData) {
      for (let i = 0; i < documents.length; i++) {
-        const docInSitemapData = sitemapData.find((data) => this.normalizeUrl(data.loc) === this.normalizeUrl(documents[i].metadata.sourceURL))
+        const docInSitemapData = sitemapData.find(
          (data) =>
            this.normalizeUrl(data.loc) ===
            this.normalizeUrl(documents[i].metadata.sourceURL)
        );
        if (docInSitemapData) {
          let sitemapDocData: Partial<SitemapEntry> = {};
          if (docInSitemapData.changefreq) {
@ -298,52 +364,82 @@ export class WebScraperDataProvider {
    return documents;
  }
  generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
-    await Promise.all(documents.map(async (document) => {
+    await Promise.all(
-      const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
+      documents.map(async (document) => {
        const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || [];
-      await Promise.all(images.map(async (image: string) => {
+        await Promise.all(
-        let imageUrl = image.match(/\(([^)]+)\)/)[1];
+          images.map(async (image: string) => {
-        let altText = image.match(/\[(.*?)\]/)[1];
+            let imageUrl = image.match(/\(([^)]+)\)/)[1];
            let altText = image.match(/\[(.*?)\]/)[1];
-        if (!altText && !imageUrl.startsWith("data:image") && /\.(png|jpeg|gif|webp)$/.test(imageUrl)) {
+            if (
-          const imageIndex = document.content.indexOf(image);
+              !altText &&
-          const contentLength = document.content.length;
+              !imageUrl.startsWith("data:image") &&
-          let backText = document.content.substring(imageIndex + image.length, Math.min(imageIndex + image.length + 1000, contentLength));
+              /\.(png|jpeg|gif|webp)$/.test(imageUrl)
-          let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
+            ) {
-          let frontText = document.content.substring(frontTextStartIndex, imageIndex);
+              const imageIndex = document.content.indexOf(image);
-          altText = await getImageDescription(imageUrl, backText, frontText);
+              const contentLength = document.content.length;
-        }
+              let backText = document.content.substring(
-
+                imageIndex + image.length,
-        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
+                Math.min(imageIndex + image.length + 1000, contentLength)
-      }));
+              );
-    }));
+              let frontTextStartIndex = Math.max(imageIndex - 1000, 0);
-
+              let frontText = document.content.substring(
-    return documents;
+                frontTextStartIndex,
-  }
+                imageIndex
-  
+              );
-  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
+              altText = await getImageDescription(
-    documents.forEach(document => {
+                imageUrl,
-      const baseUrl = new URL(document.metadata.sourceURL).origin;
+                backText,
-      const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
+                frontText
-  
+              );
      images.forEach(image => {
        let imageUrl = image.match(/\(([^)]+)\)/)[1];
        let altText = image.match(/\[(.*?)\]/)[1];
        if (!imageUrl.startsWith("data:image")) {
          if (!imageUrl.startsWith("http")) {
            if (imageUrl.startsWith("/")) {
              imageUrl = imageUrl.substring(1);
            }
            imageUrl = new URL(imageUrl, baseUrl).toString();
          }
        }
        document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
      });
    });
    return documents;
  }
 }
            document.content = document.content.replace(
              image,
              `![${altText}](${imageUrl})`
            );
          })
        );
      })
    );
    return documents;
  };
  replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
    try {
      documents.forEach((document) => {
        const baseUrl = new URL(document.metadata.sourceURL).origin;
        const images =
          document.content.match(
            /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
          ) || [];
        images.forEach((image: string) => {
          let imageUrl = image.match(/\(([^)]+)\)/)[1];
          let altText = image.match(/\[(.*?)\]/)[1];
          if (!imageUrl.startsWith("data:image")) {
            if (!imageUrl.startsWith("http")) {
              if (imageUrl.startsWith("/")) {
                imageUrl = imageUrl.substring(1);
              }
              imageUrl = new URL(imageUrl, baseUrl).toString();
            }
          }
          document.content = document.content.replace(
            image,
            `![${altText}](${imageUrl})`
          );
        });
      });
      return documents;
    } catch (error) {
      return documents;
    }
  };
 }