Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 11:42:24 +08:00)
Update index.ts
commit 00373228fa
parent ef6db3b7c2
@@ -67,211 +67,138 @@ export class WebScraperDataProvider {
    useCaching: boolean = false,
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {

    this.validateInitialUrl();

    if (!useCaching) {
      return this.processDocumentsWithoutCache(inProgress);
    }

    return this.processDocumentsWithCache(inProgress);
  }

  private validateInitialUrl(): void {
    if (this.urls[0].trim() === "") {
      throw new Error("Url is required");
    }
  }

    if (!useCaching) {
      if (this.mode === "crawl") {
        const crawler = new WebCrawler({
          initialUrl: this.urls[0],
          includes: this.includes,
          excludes: this.excludes,
          maxCrawledLinks: this.maxCrawledLinks,
          limit: this.limit,
          generateImgAltText: this.generateImgAltText,
        });
        let links = await crawler.start(inProgress, 5, this.limit);
        if (this.returnOnlyUrls) {
          inProgress({
            current: links.length,
            total: links.length,
            status: "COMPLETED",
            currentDocumentUrl: this.urls[0],
          });
          return links.map((url) => ({
            content: "",
            markdown: "",
            metadata: { sourceURL: url },
          }));
        }
  private async processDocumentsWithoutCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    switch (this.mode) {
      case "crawl":
        return this.handleCrawlMode(inProgress);
      case "single_urls":
        return this.handleSingleUrlsMode(inProgress);
      case "sitemap":
        return this.handleSitemapMode(inProgress);
      default:
        return [];
    }
  }

        let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
        let pdfDocuments: Document[] = [];
        for (let pdfLink of pdfLinks) {
          const pdfContent = await fetchAndProcessPdf(pdfLink);
          pdfDocuments.push({
            content: pdfContent,
            metadata: { sourceURL: pdfLink },
            provider: "web-scraper"
          });
        }
        links = links.filter((link) => !link.endsWith(".pdf"));

        let documents = await this.convertUrlsToDocuments(links, inProgress);
        documents = await this.getSitemapData(this.urls[0], documents);

        if (this.replaceAllPathsWithAbsolutePaths) {
          documents = replacePathsWithAbsolutePaths(documents);
        } else {
          documents = replaceImgPathsWithAbsolutePaths(documents);
        }

        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
        documents = documents.concat(pdfDocuments);

        // CACHING DOCUMENTS
        // - parent document
        const cachedParentDocumentString = await getValue(
          "web-scraper-cache:" + this.normalizeUrl(this.urls[0])
        );
        if (cachedParentDocumentString != null) {
          let cachedParentDocument = JSON.parse(cachedParentDocumentString);
          if (
            !cachedParentDocument.childrenLinks ||
            cachedParentDocument.childrenLinks.length < links.length - 1
          ) {
            cachedParentDocument.childrenLinks = links.filter(
              (link) => link !== this.urls[0]
            );
            await setValue(
              "web-scraper-cache:" + this.normalizeUrl(this.urls[0]),
              JSON.stringify(cachedParentDocument),
              60 * 60 * 24 * 10
            ); // 10 days
          }
        } else {
          let parentDocument = documents.filter(
            (document) =>
              this.normalizeUrl(document.metadata.sourceURL) ===
              this.normalizeUrl(this.urls[0])
          );
          await this.setCachedDocuments(parentDocument, links);
        }

        await this.setCachedDocuments(
          documents.filter(
            (document) =>
              this.normalizeUrl(document.metadata.sourceURL) !==
              this.normalizeUrl(this.urls[0])
          ),
          []
        );
        documents = this.removeChildLinks(documents);
        documents = documents.splice(0, this.limit);
        return documents;
      }

      if (this.mode === "single_urls") {
        let pdfLinks = this.urls.filter((link) => link.endsWith(".pdf"));
        let pdfDocuments: Document[] = [];
        for (let pdfLink of pdfLinks) {
          const pdfContent = await fetchAndProcessPdf(pdfLink);
          pdfDocuments.push({
            content: pdfContent,
            metadata: { sourceURL: pdfLink },
            provider: "web-scraper"
          });
        }

        let documents = await this.convertUrlsToDocuments(
          this.urls.filter((link) => !link.endsWith(".pdf")),
          inProgress
        );

        if (this.replaceAllPathsWithAbsolutePaths) {
          documents = replacePathsWithAbsolutePaths(documents);
        } else {
          documents = replaceImgPathsWithAbsolutePaths(documents);
        }

        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
        const baseUrl = new URL(this.urls[0]).origin;
        documents = await this.getSitemapData(baseUrl, documents);
        documents = documents.concat(pdfDocuments);

        if(this.extractorOptions.mode === "llm-extraction") {
          documents = await generateCompletions(
            documents,
            this.extractorOptions
          )
        }

        await this.setCachedDocuments(documents);
        documents = this.removeChildLinks(documents);
        documents = documents.splice(0, this.limit);
        return documents;
      }
      if (this.mode === "sitemap") {
        let links = await getLinksFromSitemap(this.urls[0]);
        let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
        let pdfDocuments: Document[] = [];
        for (let pdfLink of pdfLinks) {
          const pdfContent = await fetchAndProcessPdf(pdfLink);
          pdfDocuments.push({
            content: pdfContent,
            metadata: { sourceURL: pdfLink },
            provider: "web-scraper"
          });
        }
        links = links.filter((link) => !link.endsWith(".pdf"));

        let documents = await this.convertUrlsToDocuments(
          links.slice(0, this.limit),
          inProgress
        );

        documents = await this.getSitemapData(this.urls[0], documents);

        if (this.replaceAllPathsWithAbsolutePaths) {
          documents = replacePathsWithAbsolutePaths(documents);
        } else {
          documents = replaceImgPathsWithAbsolutePaths(documents);
        }

        if (this.generateImgAltText) {
          documents = await this.generatesImgAltText(documents);
        }
        documents = documents.concat(pdfDocuments);

        await this.setCachedDocuments(documents);
        documents = this.removeChildLinks(documents);
        documents = documents.splice(0, this.limit);
        return documents;
      }

      return [];
  private async handleCrawlMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    const crawler = new WebCrawler({
      initialUrl: this.urls[0],
      includes: this.includes,
      excludes: this.excludes,
      maxCrawledLinks: this.maxCrawledLinks,
      limit: this.limit,
      generateImgAltText: this.generateImgAltText,
    });
    let links = await crawler.start(inProgress, 5, this.limit);
    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

    let documents = await this.getCachedDocuments(
      this.urls.slice(0, this.limit)
    );
    let documents = await this.processLinks(links, inProgress);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

  private async handleSingleUrlsMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    let documents = await this.convertUrlsToDocuments(this.urls, inProgress);
    documents = await this.applyPathReplacements(documents);
    documents = await this.applyImgAltText(documents);
    return documents;
  }

  private async handleSitemapMode(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    let links = await getLinksFromSitemap(this.urls[0]);
    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

    let documents = await this.processLinks(links, inProgress);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

  private async returnOnlyUrlsResponse(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
    inProgress?.({
      current: links.length,
      total: links.length,
      status: "COMPLETED",
      currentDocumentUrl: this.urls[0],
    });
    return links.map(url => ({
      content: "",
      markdown: "",
      metadata: { sourceURL: url },
    }));
  }

  private async processLinks(links: string[], inProgress?: (progress: Progress) => void): Promise<Document[]> {
    let pdfLinks = links.filter(link => link.endsWith(".pdf"));
    let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
    links = links.filter(link => !link.endsWith(".pdf"));

    let documents = await this.convertUrlsToDocuments(links, inProgress);
    documents = await this.getSitemapData(this.urls[0], documents);
    documents = this.applyPathReplacements(documents);
    documents = await this.applyImgAltText(documents);
    return documents.concat(pdfDocuments);
  }

  private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
    return Promise.all(pdfLinks.map(async pdfLink => {
      const pdfContent = await fetchAndProcessPdf(pdfLink);
      return {
        content: pdfContent,
        metadata: { sourceURL: pdfLink },
        provider: "web-scraper"
      };
    }));
  }

  private applyPathReplacements(documents: Document[]): Document[] {
    return this.replaceAllPathsWithAbsolutePaths ? replacePathsWithAbsolutePaths(documents) : replaceImgPathsWithAbsolutePaths(documents);
  }

  private async applyImgAltText(documents: Document[]): Promise<Document[]> {
    return this.generateImgAltText ? this.generatesImgAltText(documents) : documents;
  }

  private async cacheAndFinalizeDocuments(documents: Document[], links: string[]): Promise<Document[]> {
    await this.setCachedDocuments(documents, links);
    documents = this.removeChildLinks(documents);
    return documents.splice(0, this.limit);
  }

  private async processDocumentsWithCache(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    let documents = await this.getCachedDocuments(this.urls.slice(0, this.limit));
    if (documents.length < this.limit) {
      const newDocuments: Document[] = await this.getDocuments(
        false,
        inProgress
      );
      newDocuments.forEach((doc) => {
        if (
          !documents.some(
            (d) =>
              this.normalizeUrl(d.metadata.sourceURL) ===
              this.normalizeUrl(doc.metadata?.sourceURL)
          )
        ) {
          documents.push(doc);
        }
      });
      const newDocuments: Document[] = await this.getDocuments(false, inProgress);
      documents = this.mergeNewDocuments(documents, newDocuments);
    }
    documents = this.filterDocsExcludeInclude(documents);
    documents = this.removeChildLinks(documents);
    documents = documents.splice(0, this.limit);
    return documents;
    return documents.splice(0, this.limit);
  }

  private mergeNewDocuments(existingDocuments: Document[], newDocuments: Document[]): Document[] {
    newDocuments.forEach(doc => {
      if (!existingDocuments.some(d => this.normalizeUrl(d.metadata.sourceURL) === this.normalizeUrl(doc.metadata?.sourceURL))) {
        existingDocuments.push(doc);
      }
    });
    return existingDocuments;
  }

  private filterDocsExcludeInclude(documents: Document[]): Document[] {