Merge pull request #14 from mendableai/nsc/clean-content

Option to extract only the main content, excluding headers, navs, footers etc.
2024-11-16 03:32:22 +08:00 · 2024-04-17 21:40:47 -04:00 · 2024-04-17 21:40:47 -04:00 · 7ce2dd976f
commit 7ce2dd976f
parent 460763ba5f ca2bf9cc12
8 changed files with 119 additions and 20 deletions
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@ -110,6 +110,8 @@ app.post("/v0/scrape", async (req, res) => {
      return res.status(400).json({ error: "Url is required" });
    }
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
    try {
      const a = new WebScraperDataProvider();
      await a.setOptions({
@ -118,6 +120,7 @@ app.post("/v0/scrape", async (req, res) => {
        crawlerOptions: {
          ...crawlerOptions,
        },
        pageOptions: pageOptions,
      });
      const docs = await a.getDocuments(false);
@ -178,6 +181,7 @@ app.post("/v0/crawl", async (req, res) => {
    }
    const mode = req.body.mode ?? "crawl";
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
    if (mode === "single_urls" && !url.includes(",")) {
      try {
@ -188,6 +192,7 @@ app.post("/v0/crawl", async (req, res) => {
          crawlerOptions: {
            returnOnlyUrls: true,
          },
          pageOptions: pageOptions,
        });
        const docs = await a.getDocuments(false, (progress) => {
@ -212,6 +217,8 @@ app.post("/v0/crawl", async (req, res) => {
      mode: mode ?? "crawl", // fix for single urls not working
      crawlerOptions: { ...crawlerOptions },
      team_id: team_id,
      pageOptions: pageOptions,
    });
    res.json({ jobId: job.id });
@ -239,11 +246,13 @@ app.post("/v0/crawlWebsitePreview", async (req, res) => {
    }
    const mode = req.body.mode ?? "crawl";
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
    const job = await addWebScraperJob({
      url: url,
      mode: mode ?? "crawl", // fix for single urls not working
      crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 },
      team_id: "preview",
      pageOptions: pageOptions,
    });
    res.json({ jobId: job.id });
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -9,6 +9,24 @@ export interface Progress {
  currentDocumentUrl?: string;
 }
 /**
  * Per-page scraping options. The API routes read this from the request body
  * and pass it through the scraper job / data provider down to the single-URL
  * scraper.
  */
 export type PageOptions = {
  /**
   * When true, the scraper removes every element matched by the
   * `excludeNonMainTags` selector list (headers, navs, footers, ...) before
   * the HTML is converted to markdown.
   *
   * NOTE(review): the API routes and `setOptions` fall back to `false` when
   * this is omitted, while `scrapSingleUrl`'s own parameter default is
   * `{ onlyMainContent: true }` — confirm which default is intended for
   * direct callers.
   */
  onlyMainContent?: boolean;
 };
 /**
  * Configuration object for the web scraper; this is the shape passed to
  * `WebScraperDataProvider.setOptions`.
  */
 export type WebScraperOptions = {
  // URLs to process. `runWebScraper` passes `[url]` in "crawl" mode and
  // `url.split(",")` for the other modes.
  urls: string[];
  // Which scraping strategy to run.
  mode: "single_urls" | "sitemap" | "crawl";
  crawlerOptions?: {
    // NOTE(review): presumably returns discovered links without scraping
    // their content — confirm against the provider implementation.
    returnOnlyUrls?: boolean;
    // NOTE(review): presumably URL patterns to keep — confirm.
    includes?: string[];
    // NOTE(review): presumably URL patterns to skip — confirm. `setOptions`
    // strips empty-string entries from its excludes list.
    excludes?: string[];
    // NOTE(review): presumably caps how many links the crawler follows —
    // confirm how this differs from `limit`.
    maxCrawledLinks?: number;
    // `setOptions` falls back to 10000 when this is omitted.
    limit?: number;
    // `setOptions` falls back to false when this is omitted.
    generateImgAltText?: boolean;
  };
  // Per-page cleaning options; `setOptions` falls back to
  // `{ onlyMainContent: false }` when this is omitted.
  pageOptions?: PageOptions;
  // NOTE(review): presumably the batch size used when scraping URLs in
  // parallel (the provider's field defaults to 20) — confirm.
  concurrentRequests?: number;
 };
 export class Document {
  id?: string;
  content: string;
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@ -13,6 +13,7 @@ export async function startWebScraperPipeline({
    url: job.data.url,
    mode: job.data.mode,
    crawlerOptions: job.data.crawlerOptions,
    pageOptions: job.data.pageOptions,
    inProgress: (progress) => {
      job.progress(progress);
    },
@ -29,6 +30,7 @@ export async function runWebScraper({
  url,
  mode,
  crawlerOptions,
  pageOptions,
  inProgress,
  onSuccess,
  onError,
@ -37,6 +39,7 @@ export async function runWebScraper({
  url: string;
  mode: "crawl" | "single_urls" | "sitemap";
  crawlerOptions: any;
  pageOptions?: any;
  inProgress: (progress: any) => void;
  onSuccess: (result: any) => void;
  onError: (error: any) => void;
@ -44,18 +47,19 @@ export async function runWebScraper({
 }): Promise<{ success: boolean; message: string; docs: CrawlResult[] }> {
  try {
    const provider = new WebScraperDataProvider();
    if (mode === "crawl") {
      await provider.setOptions({
        mode: mode,
        urls: [url],
        crawlerOptions: crawlerOptions,
        pageOptions: pageOptions,
      });
    } else {
      await provider.setOptions({
        mode: mode,
        urls: url.split(","),
        crawlerOptions: crawlerOptions,
        pageOptions: pageOptions,
      });
    }
    const docs = (await provider.getDocuments(false, (progress: Progress) => {
--- a/apps/api/src/scraper/WebScraper/tests/index.test.ts
+++ b/apps/api/src/scraper/WebScraper/tests/index.test.ts
@ -13,6 +13,10 @@ describe("WebScraperDataProvider", () => {
          metadata: { sourceURL: "https://example.com/another-page" },
          content: "![another alt text](./another-image.png)",
        },
        {
          metadata: { sourceURL: "https://example.com/another-page" },
          content: "![another alt text](./another-image.webp)",
        },
        {
          metadata: { sourceURL: "https://example.com/data-image" },
          content: "![data image](data:image/png;base64,...)",
@ -28,6 +32,10 @@ describe("WebScraperDataProvider", () => {
          metadata: { sourceURL: "https://example.com/another-page" },
          content: "![another alt text](https://example.com/another-image.png)",
        },
        {
          metadata: { sourceURL: "https://example.com/another-page" },
          content: "![another alt text](https://example.com/another-image.webp)",
        },
        {
          metadata: { sourceURL: "https://example.com/data-image" },
          content: "![data image](data:image/png;base64,...)",
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -1,4 +1,4 @@
-import { Document } from "../../lib/entities";
+import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@ -6,19 +6,7 @@ import { WebCrawler } from "./crawler";
 import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/gptVision";
-export type WebScraperOptions = {
+
  urls: string[];
  mode: "single_urls" | "sitemap" | "crawl";
  crawlerOptions?: {
    returnOnlyUrls?: boolean;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
    limit?: number;
    generateImgAltText?: boolean;
  };
  concurrentRequests?: number;
 };
 export class WebScraperDataProvider {
  private urls: string[] = [""];
  private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
@ -29,6 +17,7 @@ export class WebScraperDataProvider {
  private limit: number = 10000;
  private concurrentRequests: number = 20;
  private generateImgAltText: boolean = false;
  private pageOptions?: PageOptions;
  authorize(): void {
    throw new Error("Method not implemented.");
@ -51,7 +40,7 @@ export class WebScraperDataProvider {
      const batchUrls = urls.slice(i, i + this.concurrentRequests);
      await Promise.all(
        batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, true);
+          const result = await scrapSingleUrl(url, true, this.pageOptions);
          processedUrls++;
          if (inProgress) {
            inProgress({
@ -321,6 +310,7 @@ export class WebScraperDataProvider {
    this.limit = options.crawlerOptions?.limit ?? 10000;
    this.generateImgAltText =
      options.crawlerOptions?.generateImgAltText ?? false;
    this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
    //! @nicolas, for some reason an empty string was being injected into excludes and breaking everything. Don't have time to find the source of the issue, so adding this check
    this.excludes = this.excludes.filter((item) => item !== "");
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -2,9 +2,10 @@ import * as cheerio from "cheerio";
 import { ScrapingBeeClient } from "scrapingbee";
 import { extractMetadata } from "./utils/metadata";
 import dotenv from "dotenv";
-import { Document } from "../../lib/entities";
+import { Document, PageOptions } from "../../lib/entities";
 import { parseMarkdown } from "../../lib/html-to-markdown";
 import { parseTablesToMarkdown } from "./utils/parseTable";
 import { excludeNonMainTags } from "./utils/excludeTags";
 // import puppeteer from "puppeteer";
 dotenv.config();
@ -77,14 +78,21 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
 export async function scrapSingleUrl(
  urlToScrap: string,
-  toMarkdown: boolean = true
+  toMarkdown: boolean = true,
  pageOptions: PageOptions = { onlyMainContent: true }
 ): Promise<Document> {
  console.log(`Scraping URL: ${urlToScrap}`);
  urlToScrap = urlToScrap.trim();
-  const removeUnwantedElements = (html: string) => {
+  const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
    const soup = cheerio.load(html);
    soup("script, style, iframe, noscript, meta, head").remove();
    if (pageOptions.onlyMainContent) {
      // remove any other tags that are not in the main content
      excludeNonMainTags.forEach((tag) => {
        soup(tag).remove();
      });
    }
    return soup.html();
  };
@ -133,7 +141,7 @@ export async function scrapSingleUrl(
        }
        break;
    }
-    let cleanedHtml = removeUnwantedElements(text);
+    let cleanedHtml = removeUnwantedElements(text, pageOptions);
    return [await parseMarkdown(cleanedHtml), text];
  };
--- a/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
+++ b/apps/api/src/scraper/WebScraper/utils/excludeTags.ts
@ -0,0 +1,60 @@
 /**
  * Selectors for page elements that are not part of the main content.
  *
  * When `pageOptions.onlyMainContent` is set, the single-URL scraper passes
  * each entry to the loaded cheerio document and removes every match before
  * the HTML is converted to markdown.
  *
  * NOTE(review): several entries are very broad (e.g. "form", ".top",
  * ".side", ".tag", ".author") and may also strip legitimate content on some
  * sites — confirm this is acceptable.
  */
 export const excludeNonMainTags = [
  // Page header / footer / navigation / sidebars
  "header",
  "footer",
  "nav",
  "aside",
  ".header",
  ".top",
  ".navbar",
  "#header",
  ".footer",
  ".bottom",
  "#footer",
  ".sidebar",
  ".side",
  ".aside",
  "#sidebar",
  // Modals, popups and overlays
  ".modal",
  ".popup",
  "#modal",
  ".overlay",
  // Advertising
  ".ad",
  ".ads",
  ".advert",
  "#ad",
  // Language selectors
  ".lang-selector",
  ".language",
  "#language-selector",
  // Social links
  ".social",
  ".social-media",
  ".social-links",
  "#social",
  // Menus and breadcrumbs
  ".menu",
  ".navigation",
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
  // Forms and search
  ".form",
  "form",
  "#search-form",
  ".search",
  "#search",
  // Sharing, pagination and widgets
  ".share",
  "#share",
  ".pagination",
  "#pagination",
  ".widget",
  "#widget",
  // Related content, taxonomy, comments and author boxes
  ".related",
  "#related",
  ".tag",
  "#tag",
  ".category",
  "#category",
  ".comment",
  "#comment",
  ".reply",
  "#reply",
  ".author",
  "#author",
 ];
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@ -20,7 +20,9 @@ export interface WebScraperOptions {
  url: string;
  mode: "crawl" | "single_urls" | "sitemap";
  crawlerOptions: any;
  pageOptions: any;
  team_id: string;
 }