Merge pull request #204 from mendableai/feat/custom-scraping-readme

[Feat] Added custom scraping conditions for readme docs
This commit is contained in:
Nicolas 2024-05-29 10:00:24 -07:00 committed by GitHub
commit 51b0b88cd4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -210,6 +210,14 @@ function getScrapingFallbackOrder(defaultScraper?: string, isWaitPresent: boolea
return scrapersInOrder as typeof baseScrapers[number][];
}
/**
 * Checks already-fetched page content for markers that call for a
 * different scraping strategy and, when one matches, re-scrapes the page.
 *
 * The only condition handled here is a `<meta name="readme-deploy"` tag
 * (matched as a literal substring), in which case the URL is fetched again
 * through `scrapWithFireEngine` with a second argument of 1000.
 *
 * @param text - Page content obtained by the earlier scraping attempt.
 * @param url - URL the content came from; used for logging and the re-scrape.
 * @returns Whatever `scrapWithFireEngine` resolves to when the marker is
 *   present, otherwise `null` so the caller keeps its existing content.
 */
async function handleCustomScraping(text: string, url: string): Promise<string | null> {
  const isReadmeDeploy = text.includes('<meta name="readme-deploy"');
  if (!isReadmeDeploy) {
    // No special case applies; the caller should keep what it already has.
    return null;
  }

  console.log(`Special use case detected for ${url}, using Fire Engine with wait time 1000ms`);
  // NOTE(review): 1000 is presumably a wait time in milliseconds, per the
  // log message above — confirm against scrapWithFireEngine's signature.
  const rescraped = await scrapWithFireEngine(url, 1000);
  return rescraped;
}
export async function scrapSingleUrl(
urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false, waitFor: 0},
@ -266,6 +274,12 @@ export async function scrapSingleUrl(
break;
}
// Check for custom scraping conditions
const customScrapedContent = await handleCustomScraping(text, url);
if (customScrapedContent) {
text = customScrapedContent;
}
//* TODO: add an option to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(text, pageOptions);