From 674500affae4bf7cd044886d94abe2b5c31ee44e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 4 Jun 2024 12:15:39 -0700 Subject: [PATCH 1/2] Nick: --- apps/api/src/lib/LLM-extraction/index.ts | 1 - .../WebScraper/custom/handleCustomScraping.ts | 44 +++++++++++++++++++ apps/api/src/scraper/WebScraper/single_url.ts | 30 ++++--------- 3 files changed, 53 insertions(+), 22 deletions(-) create mode 100644 apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index ea6ddfdf..6614dbdf 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -1,4 +1,3 @@ -import Turndown from "turndown"; import OpenAI from "openai"; import Ajv from "ajv"; const ajv = new Ajv(); // Initialize AJV for JSON schema validation diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts new file mode 100644 index 00000000..d0f79e70 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -0,0 +1,44 @@ +export async function handleCustomScraping( + text: string, + url: string +): Promise<{ scraper: string; url: string; wait_after_load: number } | null> { + // Check for Readme Docs special case + if (text.includes(' { - // Check for Readme Docs special case - if (text.includes(' Date: Tue, 4 Jun 2024 12:22:46 -0700 Subject: [PATCH 2/2] Update handleCustomScraping.ts --- apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index d0f79e70..5f6c34fe 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -14,6 +14,7 @@ export async function handleCustomScraping( }; } + // Check for Vanta security portals if (text.includes('