diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts
index db085874..fad8fa3c 100644
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@@ -110,6 +110,8 @@ app.post("/v0/scrape", async (req, res) => {
       return res.status(400).json({ error: "Url is required" });
     }
 
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+
     try {
       const a = new WebScraperDataProvider();
       await a.setOptions({
@@ -118,6 +120,7 @@ app.post("/v0/scrape", async (req, res) => {
         crawlerOptions: {
           ...crawlerOptions,
         },
+        pageOptions: pageOptions,
       });
 
       const docs = await a.getDocuments(false);
@@ -178,6 +181,7 @@ app.post("/v0/crawl", async (req, res) => {
     }
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
 
     if (mode === "single_urls" && !url.includes(",")) {
       try {
@@ -188,6 +192,7 @@ app.post("/v0/crawl", async (req, res) => {
           crawlerOptions: {
             returnOnlyUrls: true,
           },
+          pageOptions: pageOptions,
         });
 
         const docs = await a.getDocuments(false, (progress) => {
@@ -212,6 +217,8 @@ app.post("/v0/crawl", async (req, res) => {
       mode: mode ?? "crawl", // fix for single urls not working
       crawlerOptions: { ...crawlerOptions },
       team_id: team_id,
+      pageOptions: pageOptions,
+
     });
 
     res.json({ jobId: job.id });
@@ -239,11 +246,13 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => {
     }
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
 
     const job = await addWebScraperJob({
       url: url,
       mode: mode ?? "crawl", // fix for single urls not working
       crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
       team_id: "preview",
+      pageOptions: pageOptions,
     });
 
     res.json({ jobId: job.id });
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 1e681a96..c3329142 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -9,6 +9,24 @@ export interface Progress {
   currentDocumentUrl?: string;
 }
 
+export type PageOptions = {
+  onlyMainContent?: boolean;
+};
+export type WebScraperOptions = {
+  urls: string[];
+  mode: "single_urls" | "sitemap" | "crawl";
+  crawlerOptions?: {
+    returnOnlyUrls?: boolean;
+    includes?: string[];
+    excludes?: string[];
+    maxCrawledLinks?: number;
+    limit?: number;
+    generateImgAltText?: boolean;
+  };
+  pageOptions?: PageOptions;
+  concurrentRequests?: number;
+};
+
 export class Document {
   id?: string;
   content: string;
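Reviewer note: a minimal sketch of how the new `pageOptions` field flows through the shared `WebScraperOptions` type moved into `lib/entities.ts` above. Import paths and the target URL are illustrative, not taken from this diff:

```ts
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { WebScraperOptions } from "../lib/entities";

async function example() {
  const options: WebScraperOptions = {
    urls: ["https://example.com"],
    mode: "single_urls",
    // Ask the scraper to strip headers, footers, navs, ads, etc.
    pageOptions: { onlyMainContent: true },
  };

  const provider = new WebScraperDataProvider();
  await provider.setOptions(options);
  return provider.getDocuments(false);
}
```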
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 762e1536..c43b1b38 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -13,6 +13,7 @@ export async function startWebScraperPipeline({
     url: job.data.url,
     mode: job.data.mode,
     crawlerOptions: job.data.crawlerOptions,
+    pageOptions: job.data.pageOptions,
     inProgress: (progress) => {
       job.progress(progress);
     },
@@ -29,6 +30,7 @@ export async function runWebScraper({
   url,
   mode,
   crawlerOptions,
+  pageOptions,
   inProgress,
   onSuccess,
   onError,
@@ -37,6 +39,7 @@ export async function runWebScraper({
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
   crawlerOptions: any;
+  pageOptions?: any;
   inProgress: (progress: any) => void;
   onSuccess: (result: any) => void;
   onError: (error: any) => void;
@@ -44,18 +47,19 @@ export async function runWebScraper({
 }): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
   try {
     const provider = new WebScraperDataProvider();
     if (mode === "crawl") {
       await provider.setOptions({
         mode: mode,
         urls: [url],
         crawlerOptions: crawlerOptions,
+        pageOptions: pageOptions,
       });
     } else {
       await provider.setOptions({
         mode: mode,
         urls: url.split(","),
         crawlerOptions: crawlerOptions,
+        pageOptions: pageOptions,
       });
     }
     const docs = (await provider.getDocuments(false, (progress: Progress) => {
diff --git a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
index 49b39263..42d95139 100644
--- a/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
+++ b/apps/api/src/scraper/WebScraper/__tests__/index.test.ts
@@ -13,6 +13,10 @@ describe("WebScraperDataProvider", () => {
         metadata: { sourceURL: "https://example.com/another-page" },
         content: "![another alt text](./another-image.png)",
       },
+      {
+        metadata: { sourceURL: "https://example.com/another-page" },
+        content: "![another alt text](./another-image.webp)",
+      },
       {
         metadata: { sourceURL: "https://example.com/data-image" },
         content: "![data image](data:image/png;base64,...)",
@@ -28,6 +32,10 @@ describe("WebScraperDataProvider", () => {
         metadata: { sourceURL: "https://example.com/another-page" },
         content: "![another alt text](https://example.com/another-image.png)",
       },
+      {
+        metadata: { sourceURL: "https://example.com/another-page" },
+        content: "![another alt text](https://example.com/another-image.webp)",
+      },
       {
         metadata: { sourceURL: "https://example.com/data-image" },
         content: "![data image](data:image/png;base64,...)",
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index e1bd4250..fbfaa7bf 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -1,4 +1,4 @@
-import { Document } from "../../lib/entities";
+import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@@ -6,19 +6,7 @@ import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
 
-export type WebScraperOptions = {
-  urls: string[];
-  mode: "single_urls" | "sitemap" | "crawl";
-  crawlerOptions?: {
-    returnOnlyUrls?: boolean;
-    includes?: string[];
-    excludes?: string[];
-    maxCrawledLinks?: number;
-    limit?: number;
-    generateImgAltText?: boolean;
-  };
-  concurrentRequests?: number;
-};
+
 export class WebScraperDataProvider {
   private urls: string[] = [""];
   private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@@ -29,6 +17,7 @@ export class WebScraperDataProvider {
   private limit: number = 10000;
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
+  private pageOptions?: PageOptions;
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -51,7 +40,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, true);
+          const result = await scrapSingleUrl(url, true, this.pageOptions);
           processedUrls++;
           if (inProgress) {
             inProgress({
@@ -321,6 +310,7 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
+    this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
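Reviewer note: the defaults differ by entry point. The HTTP routes fall back to `{ onlyMainContent: false }`, while `scrapSingleUrl` (next file) defaults to `{ onlyMainContent: true }` when called without an explicit `pageOptions`, so only direct callers get the stricter behavior. A hypothetical request exercising the new option against the `/v0/scrape` route from this diff (host, port, and response shape are placeholders, not confirmed by the diff):

```ts
const res = await fetch("http://localhost:3002/v0/scrape", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://example.com/blog/post",
    // Omit pageOptions entirely to keep the current full-page behavior.
    pageOptions: { onlyMainContent: true },
  }),
});
const body = await res.json();
console.log(body);
```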
 
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find the source of the issue, so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index f71221c8..fbcd9238 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -2,9 +2,10 @@ import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document } from "../../lib/entities";
+import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { parseTablesToMarkdown } from "./utils/parseTable";
+import { excludeNonMainTags } from "./utils/excludeTags";
 // import puppeteer from "puppeteer";
 
 dotenv.config();
@@ -77,14 +78,21 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
 
 export async function scrapSingleUrl(
   urlToScrap: string,
-  toMarkdown: boolean = true
+  toMarkdown: boolean = true,
+  pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
   console.log(`Scraping URL: ${urlToScrap}`);
   urlToScrap = urlToScrap.trim();
 
-  const removeUnwantedElements = (html: string) => {
+  const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
     const soup = cheerio.load(html);
     soup("script, style, iframe, noscript, meta, head").remove();
+    if (pageOptions.onlyMainContent) {
+      // remove any other tags that are not in the main content
+      excludeNonMainTags.forEach((tag) => {
+        soup(tag).remove();
+      });
+    }
     return soup.html();
   };
@@ -133,7 +141,7 @@ export async function scrapSingleUrl(
       }
       break;
   }
-  let cleanedHtml = removeUnwantedElements(text);
+  let cleanedHtml = removeUnwantedElements(text, pageOptions);
 
   return [await parseMarkdown(cleanedHtml), text];
 };
diff --git a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
new file mode 100644
index 00000000..142bcef0
--- /dev/null
+++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
@@ -0,0 +1,60 @@
+export const excludeNonMainTags = [
+  "header",
+  "footer",
+  "nav",
+  "aside",
+  ".header",
+  ".top",
+  ".navbar",
+  "#header",
+  ".footer",
+  ".bottom",
+  "#footer",
+  ".sidebar",
+  ".side",
+  ".aside",
+  "#sidebar",
+  ".modal",
+  ".popup",
+  "#modal",
+  ".overlay",
+  ".ad",
+  ".ads",
+  ".advert",
+  "#ad",
+  ".lang-selector",
+  ".language",
+  "#language-selector",
+  ".social",
+  ".social-media",
+  ".social-links",
+  "#social",
+  ".menu",
+  ".navigation",
+  "#nav",
+  ".breadcrumbs",
+  "#breadcrumbs",
+  ".form",
+  "form",
+  "#search-form",
+  ".search",
+  "#search",
+  ".share",
+  "#share",
+  ".pagination",
+  "#pagination",
+  ".widget",
+  "#widget",
+  ".related",
+  "#related",
+  ".tag",
+  "#tag",
+  ".category",
+  "#category",
+  ".comment",
+  "#comment",
+  ".reply",
+  "#reply",
+  ".author",
+  "#author",
+];
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index a3de049e..2123e0c2 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -20,7 +20,9 @@ export interface WebScraperOptions {
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
   crawlerOptions: any;
+  pageOptions: any;
   team_id: string;
 }
+
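Reviewer note: the pruning step is plain cheerio, so it is easy to sanity-check in isolation. A self-contained sketch of what `removeUnwantedElements` does when `onlyMainContent` is set (the sample HTML is made up; the import path matches the new file in this diff):

```ts
import * as cheerio from "cheerio";
import { excludeNonMainTags } from "./utils/excludeTags";

const html = `<html><body>
  <nav>site menu</nav>
  <article>The content we actually want.</article>
  <footer>copyright</footer>
</body></html>`;

const soup = cheerio.load(html);
// Always removed, regardless of pageOptions:
soup("script, style, iframe, noscript, meta, head").remove();
// Removed only when pageOptions.onlyMainContent is true:
excludeNonMainTags.forEach((tag) => soup(tag).remove());

console.log(soup.html()); // <nav> and <footer> are gone; <article> survives
```

One thing to be aware of: `form` and `.search` are on the exclude list, so pages whose main content is a form or a search results view may come back mostly empty when `onlyMainContent` is enabled.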