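Cleaned up: removed the unused up-front PDF link splitting (splitPdfLinks, isUrlAPdf) and added response content-type PDF checks to scrapWithPlaywright and scrapSingleUrl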

2024-11-16 03:32:22 +08:00 · 2024-05-13 16:13:10 -03:00 · 2024-05-13 16:13:10 -03:00 · 8eb2e95f19
commit 8eb2e95f19
parent f4348024c6
3 changed files with 18 additions and 65 deletions
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
 import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
-import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
+import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import {
  replaceImgPathsWithAbsolutePaths,
  replacePathsWithAbsolutePaths,
@@ -144,11 +144,7 @@ export class WebScraperDataProvider {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

-    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);    
-
    let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

@@ -156,11 +152,8 @@ export class WebScraperDataProvider {
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    const links = this.urls;
-    // const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);

    let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
    return documents;
  }

@@ -172,11 +165,7 @@ export class WebScraperDataProvider {
      return this.returnOnlyUrlsResponse(links, inProgress);
    }

-    // let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
-    // const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
-
    let documents = await this.processLinks(links, inProgress);
-    // documents.push(...pdfDocuments);
    return this.cacheAndFinalizeDocuments(documents, links);
  }

@@ -233,19 +222,6 @@ export class WebScraperDataProvider {
    );
  }

-  private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
-    const checks = links.map(async (link) => ({
-      link,
-      isPdf: await isUrlAPdf({ url: link })
-    }));
-  
-    const results = await Promise.all(checks);
-    const pdfLinks = results.filter(result => result.isPdf).map(result => result.link);
-    const notPdfLinks = results.filter(result => !result.isPdf).map(result => result.link);
-  
-    return [pdfLinks, notPdfLinks];
-  }
-
  private applyPathReplacements(documents: Document[]): Document[] {
    return this.replaceAllPathsWithAbsolutePaths
      ? replacePathsWithAbsolutePaths(documents)
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -67,13 +67,11 @@ export async function scrapWithScrapingBee(
      );
      return "";
    }
-    // Check the content type of the response
+    
    const contentType = response.headers['content-type'];
    if (contentType && contentType.includes('application/pdf')) {
-      // Handle PDF content type
      return fetchAndProcessPdf(url);
    } else {
-      // Assume the content is text and decode it
      const decoder = new TextDecoder();
      const text = decoder.decode(response.data);
      return text;
@@ -104,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
      return "";
    }

-    const data = await response.json();
-    const html = data.content;
-    return html ?? "";
+    const contentType = response.headers['content-type'];
+    if (contentType && contentType.includes('application/pdf')) {
+      return fetchAndProcessPdf(url);
+    } else {
+      const data = await response.json();
+      const html = data.content;
+      return html ?? "";
+    }
  } catch (error) {
    console.error(`Error scraping with Puppeteer: ${error}`);
    return "";
@@ -173,7 +176,13 @@ export async function scrapSingleUrl(
            );
            return "";
          }
-          text = await response.text();
+
+          const contentType = response.headers['content-type'];
+          if (contentType && contentType.includes('application/pdf')) {
+            return fetchAndProcessPdf(url);
+          } else {
+            text = await response.text();
+          }
        } catch (error) {
          console.error(`Error scraping URL: ${error}`);
          return "";
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -105,36 +105,4 @@ async function processPdf(file: string) {
  const fileContent = fs.readFileSync(file);
  const data = await pdf(fileContent);
  return data.text;
-}
-/**
- * Check if a url is a pdf
- * @param url The url to check
- * @param fastMode If true, the function will return false if the url is does not end with .pdf
- * @returns A promise that resolves to true if the url is a pdf, false otherwise
- */
-export async function isUrlAPdf({
-  url,
-  fastMode = false,
-}: {
-  url: string;
-  fastMode?: boolean;
-}): Promise<boolean> {
-  try {
-    if (url.endsWith(".pdf")) {
-      return true;
-    }
-    // If fast mode is enabled, we skip the HEAD request and return false
-    if (fastMode) {
-      return false;
-    }
-    const before = Date.now();
-    const response = await axios.head(url);
-    const after = Date.now();
-    console.log(`${after - before}ms - HEAD Request for ${url}`);
-    const contentType = response.headers['content-type'];
-    return contentType.includes('application/pdf');
-  } catch (error) {
-    // console.error("Error making HEAD request:", error);
-    return false;
-  }
-}
+}