Nicolas 2024-06-10 16:27:10 -07:00
parent aafd23fa8a
commit 3091f0134c
3 changed files with 13 additions and 5 deletions

View File

@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
-import { Progress } from "../../lib/entities";
+import { PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
@@ -108,6 +108,7 @@ export class WebCrawler {
   public async start(
     inProgress?: (progress: Progress) => void,
     pageOptions?: PageOptions,
+    concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
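
Note that concurrencyLimit defaults to 5 and is slotted in before the existing limit and maxDepth parameters, so any caller passing those positionally has to be updated as well; that is exactly what the second file's hunk below does.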
@@ -130,6 +131,7 @@ export class WebCrawler {
     const urls = await this.crawlUrls(
       [this.initialUrl],
       pageOptions,
+      concurrencyLimit,
       inProgress
     );
@@ -148,6 +150,7 @@ export class WebCrawler {
   private async crawlUrls(
     urls: string[],
     pageOptions: PageOptions,
+    concurrencyLimit: number,
     inProgress?: (progress: Progress) => void,
   ): Promise<{ url: string, html: string }[]> {
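
The concurrencyLimit parameter is presumably fed to the async package's queue, which crawlUrls already uses (note the task/callback worker pattern in the next hunk). A minimal sketch of that pattern, with an illustrative worker and URLs rather than the crawler's own code:

import async from "async";

async function demo() {
  // async.queue(worker, concurrency): at most `concurrency` workers run at once
  const queue = async.queue(async (url: string) => {
    console.log(`crawling ${url}`);
  }, 5); // mirrors concurrencyLimit's default of 5

  queue.push(["https://example.com/a", "https://example.com/b"]);
  await queue.drain(); // resolves once every queued task has finished
}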
@@ -158,7 +161,7 @@ export class WebCrawler {
         }
         return;
       }
-      const newUrls = await this.crawl(task);
+      const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
       //   let normalizedInitial = this.initialUrl;
@@ -188,7 +191,7 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
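
The one-line change above is the substantive fix: the old call passed concurrencyLimit in the slot where crawlUrls expects pageOptions, so page options were silently dropped on every recursive call. The new call threads pageOptions and concurrencyLimit through in the order the updated signature declares.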
@@ -207,7 +210,7 @@ export class WebCrawler {
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
 
-  async crawl(url: string): Promise<{url: string, html: string}[]> {
+  async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
       return [];
     }
@@ -231,7 +234,8 @@ export class WebCrawler {
     let content : string = "";
     // If it is the first link, fetch with single url
     if (this.visited.size === 1) {
-      const page = await scrapSingleUrl(url, {includeHtml: true});
+      console.log(pageOptions)
+      const page = await scrapSingleUrl(url, {...pageOptions, includeHtml: true});
       content = page.html ?? ""
     } else {
       const response = await axios.get(url);
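
In the new scrapSingleUrl call, the spread puts pageOptions first, so the explicit includeHtml: true afterwards always wins; the crawler needs the raw HTML of the first page regardless of what the caller asked for. A small sketch of that spread-order rule (the option names here are illustrative):

const pageOptions = { includeHtml: false, onlyMainContent: true };
const merged = { ...pageOptions, includeHtml: true };
// keys written after the spread override the spread's values:
// merged is { includeHtml: true, onlyMainContent: true }

The console.log(pageOptions) line above reads as a leftover debug statement rather than part of the fix.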

View File

@@ -173,6 +173,7 @@ export class WebScraperDataProvider {
     let links = await crawler.start(
       inProgress,
       this.pageOptions,
+      5,
       this.limit,
       this.maxCrawledDepth
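
Passing a literal 5 here matches concurrencyLimit's default, so behavior is unchanged; the call site has to supply something because concurrencyLimit now sits before this.limit and this.maxCrawledDepth in the positional argument list.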

View File

@@ -12,6 +12,8 @@ export async function getLinksFromSitemap(
     content = response.data;
   } catch (error) {
     console.error(`Request failed for ${sitemapUrl}: ${error}`);
+    console.log(allUrls)
+    return allUrls;
   }
@@ -34,6 +36,7 @@ export async function getLinksFromSitemap(
   } catch (error) {
     console.error(`Error processing ${sitemapUrl}: ${error}`);
   }
+  console.log(allUrls)
   return allUrls;
 }
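
The added early return appears to make a failed sitemap request non-fatal: rather than continuing past the failed fetch, getLinksFromSitemap now hands back whatever allUrls holds at that point (typically an empty array). Both console.log(allUrls) lines, like the one in the crawler file, look like debug output left in alongside the fix.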