Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 03:32:22 +08:00)

Merge pull request #14 from mendableai/nsc/clean-content
Option to extract only the main content, excluding headers, navs, footers, etc.

This change is contained in commit 7ce2dd976f.

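For context on the new option, here is a minimal client sketch of calling the /v0/scrape endpoint with pageOptions. The host, port, and response handling are assumptions for illustration; only the request fields (url, pageOptions) come from the diff below.

// Hypothetical client call; host/port and response shape are assumptions.
const response = await fetch("http://localhost:3002/v0/scrape", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://example.com/blog/post",
    // New in this PR: ask the scraper to drop headers, navs, footers, etc.
    pageOptions: { onlyMainContent: true },
  }),
});
const result = await response.json();
console.log(result);
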
@@ -110,6 +110,8 @@ app.post("/v0/scrape", async (req, res) => {
       return res.status(400).json({ error: "Url is required" });
     }

+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+
     try {
       const a = new WebScraperDataProvider();
       await a.setOptions({

@@ -118,6 +120,7 @@ app.post("/v0/scrape", async (req, res) => {
       crawlerOptions: {
         ...crawlerOptions,
       },
+      pageOptions: pageOptions,
     });

     const docs = await a.getDocuments(false);

@@ -178,6 +181,7 @@ app.post("/v0/crawl", async (req, res) => {
     }
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };

     if (mode === "single_urls" && !url.includes(",")) {
       try {

@@ -188,6 +192,7 @@ app.post("/v0/crawl", async (req, res) => {
         crawlerOptions: {
           returnOnlyUrls: true,
         },
+        pageOptions: pageOptions,
       });

       const docs = await a.getDocuments(false, (progress) => {

@@ -212,6 +217,8 @@ app.post("/v0/crawl", async (req, res) => {
     mode: mode ?? "crawl", // fix for single urls not working
     crawlerOptions: { ...crawlerOptions },
     team_id: team_id,
+    pageOptions: pageOptions,
   });

   res.json({ jobId: job.id });

@@ -239,11 +246,13 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => {
     }
     const mode = req.body.mode ?? "crawl";
     const crawlerOptions = req.body.crawlerOptions ?? {};
+    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
     const job = await addWebScraperJob({
       url: url,
       mode: mode ?? "crawl", // fix for single urls not working
       crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
       team_id: "preview",
+      pageOptions: pageOptions,
     });

     res.json({ jobId: job.id });

@@ -9,6 +9,24 @@ export interface Progress {
   currentDocumentUrl?: string;
 }

+export type PageOptions = {
+  onlyMainContent?: boolean;
+};
+export type WebScraperOptions = {
+  urls: string[];
+  mode: "single_urls" | "sitemap" | "crawl";
+  crawlerOptions?: {
+    returnOnlyUrls?: boolean;
+    includes?: string[];
+    excludes?: string[];
+    maxCrawledLinks?: number;
+    limit?: number;
+    generateImgAltText?: boolean;
+  };
+  pageOptions?: PageOptions;
+  concurrentRequests?: number;
+};

 export class Document {
   id?: string;
   content: string;

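To illustrate how the new types fit together, a small sketch of building a WebScraperOptions value and feeding it to WebScraperDataProvider, based on the setOptions/getDocuments calls visible elsewhere in this diff. The import paths are assumptions.

import { WebScraperOptions } from "../../lib/entities"; // path is an assumption
import { WebScraperDataProvider } from "../../scraper/WebScraper"; // path is an assumption

async function scrapeMainContent() {
  const options: WebScraperOptions = {
    urls: ["https://example.com"],
    mode: "single_urls",
    crawlerOptions: { maxCrawledLinks: 10, limit: 100 },
    pageOptions: { onlyMainContent: true }, // new field introduced by this PR
    concurrentRequests: 5,
  };

  const provider = new WebScraperDataProvider();
  await provider.setOptions(options);
  return provider.getDocuments(false);
}
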
@@ -13,6 +13,7 @@ export async function startWebScraperPipeline({
     url: job.data.url,
     mode: job.data.mode,
     crawlerOptions: job.data.crawlerOptions,
+    pageOptions: job.data.pageOptions,
     inProgress: (progress) => {
       job.progress(progress);
     },

@@ -29,6 +30,7 @@ export async function runWebScraper({
   url,
   mode,
   crawlerOptions,
+  pageOptions,
   inProgress,
   onSuccess,
   onError,

@@ -37,6 +39,7 @@ export async function runWebScraper({
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
   crawlerOptions: any;
+  pageOptions?: any;
   inProgress: (progress: any) => void;
   onSuccess: (result: any) => void;
   onError: (error: any) => void;

@@ -44,18 +47,19 @@ export async function runWebScraper({
 }): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
   try {
     const provider = new WebScraperDataProvider();

     if (mode === "crawl") {
       await provider.setOptions({
         mode: mode,
         urls: [url],
         crawlerOptions: crawlerOptions,
+        pageOptions: pageOptions,
       });
     } else {
       await provider.setOptions({
         mode: mode,
         urls: url.split(","),
         crawlerOptions: crawlerOptions,
+        pageOptions: pageOptions,
       });
     }
     const docs = (await provider.getDocuments(false, (progress: Progress) => {

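For reference, a hypothetical direct invocation of runWebScraper with the widened signature. In practice the queue worker supplies these values from job.data, so this call site and the handler bodies are illustrative only.

await runWebScraper({
  url: "https://example.com",
  mode: "single_urls",
  crawlerOptions: {},
  pageOptions: { onlyMainContent: true }, // threaded through to the data provider
  inProgress: (progress) => console.log("progress:", progress),
  onSuccess: (result) => console.log("scraped docs:", result),
  onError: (error) => console.error("scrape failed:", error),
});
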
@@ -13,6 +13,10 @@ describe("WebScraperDataProvider", () => {
       metadata: { sourceURL: "https://example.com/another-page" },
       content: "![another alt text](./another-image.png)",
     },
+    {
+      metadata: { sourceURL: "https://example.com/another-page" },
+      content: "![another alt text](./another-image.webp)",
+    },
     {
       metadata: { sourceURL: "https://example.com/data-image" },
       content: "![data image](data:image/png;base64,...)",

@@ -28,6 +32,10 @@ describe("WebScraperDataProvider", () => {
       metadata: { sourceURL: "https://example.com/another-page" },
       content: "![another alt text](https://example.com/another-image.png)",
     },
+    {
+      metadata: { sourceURL: "https://example.com/another-page" },
+      content: "![another alt text](https://example.com/another-image.webp)",
+    },
     {
       metadata: { sourceURL: "https://example.com/data-image" },
       content: "![data image](data:image/png;base64,...)",

@@ -1,4 +1,4 @@
-import { Document } from "../../lib/entities";
+import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";

@@ -6,19 +6,7 @@ import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";

-export type WebScraperOptions = {
-  urls: string[];
-  mode: "single_urls" | "sitemap" | "crawl";
-  crawlerOptions?: {
-    returnOnlyUrls?: boolean;
-    includes?: string[];
-    excludes?: string[];
-    maxCrawledLinks?: number;
-    limit?: number;
-    generateImgAltText?: boolean;
-  };
-  concurrentRequests?: number;
-};

 export class WebScraperDataProvider {
   private urls: string[] = [""];
   private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";

@@ -29,6 +17,7 @@ export class WebScraperDataProvider {
   private limit: number = 10000;
   private concurrentRequests: number = 20;
   private generateImgAltText: boolean = false;
+  private pageOptions?: PageOptions;

   authorize(): void {
     throw new Error("Method not implemented.");

@@ -51,7 +40,7 @@ export class WebScraperDataProvider {
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, true);
+          const result = await scrapSingleUrl(url, true, this.pageOptions);
           processedUrls++;
           if (inProgress) {
             inProgress({

@@ -321,6 +310,7 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
+    this.pageOptions = options.pageOptions ?? { onlyMainContent: false };

     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");

|
@ -2,9 +2,10 @@ import * as cheerio from "cheerio";
|
|||
import { ScrapingBeeClient } from "scrapingbee";
|
||||
import { extractMetadata } from "./utils/metadata";
|
||||
import dotenv from "dotenv";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { Document, PageOptions } from "../../lib/entities";
|
||||
import { parseMarkdown } from "../../lib/html-to-markdown";
|
||||
import { parseTablesToMarkdown } from "./utils/parseTable";
|
||||
import { excludeNonMainTags } from "./utils/excludeTags";
|
||||
// import puppeteer from "puppeteer";
|
||||
|
||||
dotenv.config();
|
||||
|
@ -77,14 +78,21 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
|
|||
|
||||
export async function scrapSingleUrl(
|
||||
urlToScrap: string,
|
||||
toMarkdown: boolean = true
|
||||
toMarkdown: boolean = true,
|
||||
pageOptions: PageOptions = { onlyMainContent: true }
|
||||
): Promise<Document> {
|
||||
console.log(`Scraping URL: ${urlToScrap}`);
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
const removeUnwantedElements = (html: string) => {
|
||||
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||
const soup = cheerio.load(html);
|
||||
soup("script, style, iframe, noscript, meta, head").remove();
|
||||
if (pageOptions.onlyMainContent) {
|
||||
// remove any other tags that are not in the main content
|
||||
excludeNonMainTags.forEach((tag) => {
|
||||
soup(tag).remove();
|
||||
});
|
||||
}
|
||||
return soup.html();
|
||||
};
|
||||
|
||||
|
@ -133,7 +141,7 @@ export async function scrapSingleUrl(
|
|||
}
|
||||
break;
|
||||
}
|
||||
let cleanedHtml = removeUnwantedElements(text);
|
||||
let cleanedHtml = removeUnwantedElements(text, pageOptions);
|
||||
return [await parseMarkdown(cleanedHtml), text];
|
||||
};
|
||||
|
||||
|
|
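The new third parameter on scrapSingleUrl defaults to { onlyMainContent: true }, so callers can override the filtering per call. A hypothetical example, assuming the URL shown:

// Uses the default pageOptions from the new signature ({ onlyMainContent: true }).
const mainContentDoc = await scrapSingleUrl("https://example.com/blog/post");

// Explicitly keep the full page, including navs, footers, and sidebars.
const fullPageDoc = await scrapSingleUrl("https://example.com/blog/post", true, {
  onlyMainContent: false,
});
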
apps/api/src/scraper/WebScraper/utils/excludeTags.ts (new file, 60 lines)

export const excludeNonMainTags = [
  "header",
  "footer",
  "nav",
  "aside",
  ".header",
  ".top",
  ".navbar",
  "#header",
  ".footer",
  ".bottom",
  "#footer",
  ".sidebar",
  ".side",
  ".aside",
  "#sidebar",
  ".modal",
  ".popup",
  "#modal",
  ".overlay",
  ".ad",
  ".ads",
  ".advert",
  "#ad",
  ".lang-selector",
  ".language",
  "#language-selector",
  ".social",
  ".social-media",
  ".social-links",
  "#social",
  ".menu",
  ".navigation",
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
  ".form",
  "form",
  "#search-form",
  ".search",
  "#search",
  ".share",
  "#share",
  ".pagination",
  "#pagination",
  ".widget",
  "#widget",
  ".related",
  "#related",
  ".tag",
  "#tag",
  ".category",
  "#category",
  ".comment",
  "#comment",
  ".reply",
  "#reply",
  ".author",
  "#author",
];

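A standalone sketch of how these entries act as CSS selectors when passed to cheerio's remove(), which is the mechanism removeUnwantedElements uses above. The sample HTML and the relative import path are invented for illustration.

import * as cheerio from "cheerio";
import { excludeNonMainTags } from "./excludeTags"; // relative path is an assumption

const html = `
  <nav>Site menu</nav>
  <div class="ad">Buy now!</div>
  <article>The actual article text.</article>
  <footer>Copyright 2024</footer>
`;

const $ = cheerio.load(html);
// Tag names ("nav", "footer") and class/id selectors (".ad", "#sidebar")
// are all plain CSS selectors, so cheerio removes them the same way.
excludeNonMainTags.forEach((selector) => {
  $(selector).remove();
});
console.log($.html()); // only the <article> markup remains
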
@@ -20,7 +20,9 @@ export interface WebScraperOptions {
   url: string;
   mode: "crawl" | "single_urls" | "sitemap";
   crawlerOptions: any;
+  pageOptions: any;
   team_id: string;
 }