Merge pull request #237 from mendableai/nsc/custom-vanta-refactor

Refactor custom scraping and add Vanta security center detection
2024-11-16 03:32:22 +08:00 · 2024-06-04 12:28:30 -07:00 · 2024-06-04 12:28:30 -07:00 · a547f9a78e
commit a547f9a78e
parent de049a5ae1 96257b7b17
3 changed files with 54 additions and 22 deletions
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@ -1,4 +1,3 @@
-import Turndown from "turndown";
 import OpenAI from "openai";
 import Ajv from "ajv";
 const ajv = new Ajv(); // Initialize AJV for JSON schema validation
--- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
+++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts
@ -0,0 +1,45 @@
+export async function handleCustomScraping( // Chooses a special-case scraping setup from markers found in the page text
+  text: string, // page markup that is searched for the marker strings/pattern below
+  url: string // echoed back unchanged in the result and used in the log messages
+): Promise<{ scraper: string; url: string; wait_after_load: number } | null> { // resolves to null when no special case matches
+  // Readme Docs special case: the text contains a <meta name="readme-deploy" ...> tag
+  if (text.includes('<meta name="readme-deploy"')) {
+    console.log(
+      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
+    );
+    return {
+      scraper: "fire-engine",
+      url: url,
+      wait_after_load: 1000, // milliseconds, per the log message above
+    };
+  }
+
+  // Vanta security portals: the text contains a <link> whose href starts with https://static.vanta.com
+  if (text.includes('<link href="https://static.vanta.com')) {
+    console.log(
+      `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
+    );
+    return {
+      scraper: "fire-engine",
+      url: url,
+      wait_after_load: 3000, // longer wait than the other two cases (3000ms vs 1000ms)
+    };
+  }
+
+  // Google Drive file-viewer links (https://drive.google.com/file/d/<id>/view) anywhere in the text; treated as PDF links
+  const googleDrivePdfPattern =
+    /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/;
+  const googleDrivePdfLink = text.match(googleDrivePdfPattern); // non-global regex: first match array, or null
+  if (googleDrivePdfLink) {
+    console.log(
+      `Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`
+    );
+    return {
+      scraper: "fire-engine",
+      url: url, // NOTE(review): this is the page URL; the matched Drive link is only logged above — confirm intended
+      wait_after_load: 1000,
+    };
+  }
+
+  return null; // no special case detected; the caller only re-scrapes on a non-null result
+}
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -7,6 +7,7 @@ import { parseMarkdown } from "../../lib/html-to-markdown";
 import { excludeNonMainTags } from "./utils/excludeTags";
 import { urlSpecificParams } from "./utils/custom/website_params";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
+import { handleCustomScraping } from "./custom/handleCustomScraping";

 dotenv.config();

@ -253,28 +254,8 @@ function getScrapingFallbackOrder(
  return scrapersInOrder as (typeof baseScrapers)[number][];
 }

-async function handleCustomScraping(
-  text: string,
-  url: string
-): Promise<FireEngineResponse | null> {
-  // Check for Readme Docs special case
-  if (text.includes('<meta name="readme-deploy"')) {
-    console.log(
-      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
-    );
-    return await scrapWithFireEngine(url, 1000);
-  }

-  // Check for Google Drive PDF links in the raw HTML
-  const googleDrivePdfPattern = /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/;
-  const googleDrivePdfLink = text.match(googleDrivePdfPattern);
-  if (googleDrivePdfLink) {
-    console.log(`Google Drive PDF link detected for ${url}: ${googleDrivePdfLink[0]}`);
-    return await scrapWithFireEngine(url, 1000);
-  }

-  return null;
-}

 export async function scrapSingleUrl(
  urlToScrap: string,
@ -345,8 +326,15 @@ export async function scrapSingleUrl(
        break;
    }

+    let customScrapedContent : FireEngineResponse | null = null;
+
    // Check for custom scraping conditions
-    const customScrapedContent = await handleCustomScraping(text, url);
+    const customScraperResult = await handleCustomScraping(text, url);
+
+    if(customScraperResult){
+      customScrapedContent  = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
+    }
+
    if (customScrapedContent) {
      text = customScrapedContent.html;
      screenshot = customScrapedContent.screenshot;