Merge pull request #237 from mendableai/nsc/custom-vanta-refactor

Refactor custom scraping and add Vanta security center support
This commit is contained in:
Nicolas 2024-06-04 12:28:30 -07:00 committed by GitHub
commit a547f9a78e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 54 additions and 22 deletions

View File

@ -1,4 +1,3 @@
import Turndown from "turndown";
import OpenAI from "openai";
import Ajv from "ajv";
const ajv = new Ajv(); // Initialize AJV for JSON schema validation

View File

@ -0,0 +1,45 @@
/**
 * Inspects raw page HTML for known special cases and, when one matches,
 * returns a descriptor naming the scraper to use and how long to wait
 * after load. The checks run in a fixed order and the first match wins:
 *
 *  1. Readme Docs pages (`<meta name="readme-deploy"` marker)   -> 1000
 *  2. Vanta security portals (static.vanta.com stylesheet link) -> 3000
 *  3. Pages embedding a Google Drive file "view" link            -> 1000
 *
 * @param text Raw HTML of the page that was already fetched.
 * @param url  URL of the page; echoed back in the result and in log lines.
 * @returns A `{ scraper, url, wait_after_load }` descriptor (the log
 *          messages describe the wait as milliseconds), or `null` when no
 *          special case applies.
 */
export async function handleCustomScraping(
  text: string,
  url: string
): Promise<{ scraper: string; url: string; wait_after_load: number } | null> {
  // Every special case currently resolves to the same scraper; only the
  // wait time differs, so build the descriptor in one place.
  const viaFireEngine = (waitAfterLoad: number) => ({
    scraper: "fire-engine",
    url,
    wait_after_load: waitAfterLoad,
  });

  // Readme Docs special case
  const isReadmeDocs = text.includes('<meta name="readme-deploy"');
  if (isReadmeDocs) {
    console.log(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    return viaFireEngine(1000);
  }

  // Vanta security portals
  const isVantaPortal = text.includes('<link href="https://static.vanta.com');
  if (isVantaPortal) {
    console.log(
      `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
    );
    return viaFireEngine(3000);
  }

  // Google Drive PDF links in the raw HTML
  const driveMatch = /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/.exec(
    text
  );
  if (driveMatch === null) {
    return null;
  }
  console.log(
    `Google Drive PDF link detected for ${url}: ${driveMatch[0]}`
  );
  return viaFireEngine(1000);
}

View File

@ -7,6 +7,7 @@ import { parseMarkdown } from "../../lib/html-to-markdown";
import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
dotenv.config();
@ -253,28 +254,8 @@ function getScrapingFallbackOrder(
return scrapersInOrder as (typeof baseScrapers)[number][];
}
/**
 * Checks raw page HTML for known special cases and, when one matches,
 * re-fetches the page through `scrapWithFireEngine(url, 1000)`.
 *
 * Cases, evaluated in order (first match wins):
 *  1. Readme Docs pages, identified by a `<meta name="readme-deploy"` marker.
 *  2. Pages whose HTML contains a Google Drive file "view" link.
 *
 * @param text Raw HTML of the page that was already fetched.
 * @param url  URL of the page; passed on to `scrapWithFireEngine` and logged.
 * @returns Whatever `scrapWithFireEngine` resolves to for a matched case,
 *          or `null` when no special case applies.
 */
async function handleCustomScraping(
  text: string,
  url: string
): Promise<FireEngineResponse | null> {
  // Readme Docs special case
  const readmeMarker = '<meta name="readme-deploy"';
  if (text.includes(readmeMarker)) {
    console.log(
      `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
    );
    // NOTE(review): second argument is presumably the wait time in ms,
    // matching the log message above — confirm against scrapWithFireEngine.
    const readmeResponse = await scrapWithFireEngine(url, 1000);
    return readmeResponse;
  }

  // Google Drive PDF links in the raw HTML
  const driveMatch = /https:\/\/drive\.google\.com\/file\/d\/[^\/]+\/view/.exec(
    text
  );
  if (driveMatch === null) {
    return null;
  }
  console.log(`Google Drive PDF link detected for ${url}: ${driveMatch[0]}`);
  const driveResponse = await scrapWithFireEngine(url, 1000);
  return driveResponse;
}
export async function scrapSingleUrl(
urlToScrap: string,
@ -345,8 +326,15 @@ export async function scrapSingleUrl(
break;
}
let customScrapedContent : FireEngineResponse | null = null;
// Check for custom scraping conditions
const customScrapedContent = await handleCustomScraping(text, url);
const customScraperResult = await handleCustomScraping(text, url);
if(customScraperResult){
customScrapedContent = await scrapWithFireEngine(customScraperResult.url, customScraperResult.wait_after_load)
}
if (customScrapedContent) {
text = customScrapedContent.html;
screenshot = customScrapedContent.screenshot;