Merge branch 'main' into test/crawl-options

This commit is contained in:
Nicolas 2024-05-15 12:34:47 -07:00
commit 1601e93d69
8 changed files with 283 additions and 35 deletions

View File

@ -18,8 +18,8 @@
"paths": { "paths": {
"/scrape": { "/scrape": {
"post": { "post": {
"summary": "Scrape a single URL", "summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeSingleUrl", "operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"], "tags": ["Scraping"],
"security": [ "security": [
{ {
@ -45,8 +45,43 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
},
"extractorOptions": {
"type": "object",
"description": "Options for LLM-based extraction of structured information from the page content",
"properties": {
"mode": {
"type": "string",
"enum": ["llm-extraction"],
"description": "The extraction mode to use, currently supports 'llm-extraction'"
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page"
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
} }
}, },
"required": ["url"] "required": ["url"]
@ -126,6 +161,16 @@
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false "default": false
}, },
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
"default": "default"
},
"limit": { "limit": {
"type": "integer", "type": "integer",
"description": "Maximum number of pages to crawl", "description": "Maximum number of pages to crawl",
@ -140,6 +185,11 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
} }
@ -206,6 +256,11 @@
"type": "boolean", "type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true "default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
}, },
@ -302,6 +357,63 @@
"$ref": "#/components/schemas/ScrapeResponse" "$ref": "#/components/schemas/ScrapeResponse"
}, },
"description": "Data returned from the job (null when it is in progress)" "description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ScrapeResponse"
},
"description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled."
}
}
}
}
}
},
"402": {
"description": "Payment required"
},
"429": {
"description": "Too many requests"
},
"500": {
"description": "Server error"
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
} }
} }
} }
@ -344,6 +456,11 @@
"content": { "content": {
"type": "string" "type": "string"
}, },
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": { "metadata": {
"type": "object", "type": "object",
"properties": { "properties": {

View File

@ -660,6 +660,107 @@ describe("E2E Tests for API Routes", () => {
// }, 120000); // 120 secs // }, 120000); // 120 secs
// }); // });
describe("POST /v0/crawl with fast mode", () => {
it("should complete the crawl under 20 seconds", async () => {
const startTime = Date.now();
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://flutterbricks.com",
crawlerOptions: {
mode: "fast"
}
});
expect(crawlResponse.statusCode).toBe(200);
const jobId = crawlResponse.body.jobId;
let statusResponse;
let isFinished = false;
while (!isFinished) {
statusResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(statusResponse.statusCode).toBe(200);
isFinished = statusResponse.body.status === "completed";
if (!isFinished) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const endTime = Date.now();
const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
console.log(`Time elapsed: ${timeElapsed} seconds`);
expect(statusResponse.body.status).toBe("completed");
expect(statusResponse.body).toHaveProperty("data");
expect(statusResponse.body.data[0]).toHaveProperty("content");
expect(statusResponse.body.data[0]).toHaveProperty("markdown");
const results = statusResponse.body.data;
// results.forEach((result, i) => {
// console.log(result.metadata.sourceURL);
// });
expect(results.length).toBeGreaterThanOrEqual(10);
expect(results.length).toBeLessThanOrEqual(15);
}, 20000);
// it("should complete the crawl in more than 10 seconds", async () => {
// const startTime = Date.now();
// const crawlResponse = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://flutterbricks.com",
// });
// expect(crawlResponse.statusCode).toBe(200);
// const jobId = crawlResponse.body.jobId;
// let statusResponse;
// let isFinished = false;
// while (!isFinished) {
// statusResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(statusResponse.statusCode).toBe(200);
// isFinished = statusResponse.body.status === "completed";
// if (!isFinished) {
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
// }
// }
// const endTime = Date.now();
// const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
// console.log(`Time elapsed: ${timeElapsed} seconds`);
// expect(statusResponse.body.status).toBe("completed");
// expect(statusResponse.body).toHaveProperty("data");
// expect(statusResponse.body.data[0]).toHaveProperty("content");
// expect(statusResponse.body.data[0]).toHaveProperty("markdown");
// const results = statusResponse.body.data;
// // results.forEach((result, i) => {
// // console.log(result.metadata.sourceURL);
// // });
// expect(results.length).toBeGreaterThanOrEqual(10);
// expect(results.length).toBeLessThanOrEqual(15);
// }, 50000); // 50 seconds timeout to account for network delays
});
describe("GET /is-production", () => { describe("GET /is-production", () => {
it("should return the production status", async () => { it("should return the production status", async () => {
const response = await request(TEST_URL).get("/is-production"); const response = await request(TEST_URL).get("/is-production");

View File

@ -44,6 +44,7 @@ export type WebScraperOptions = {
limit?: number; limit?: number;
generateImgAltText?: boolean; generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
}; };
pageOptions?: PageOptions; pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions; extractorOptions?: ExtractorOptions;

View File

@ -4,7 +4,7 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap"; import { getLinksFromSitemap } from "./sitemap";
import async from "async"; import async from "async";
import { Progress } from "../../lib/entities"; import { Progress } from "../../lib/entities";
import { scrapWithScrapingBee } from "./single_url"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
export class WebCrawler { export class WebCrawler {
@ -15,7 +15,7 @@ export class WebCrawler {
private maxCrawledLinks: number; private maxCrawledLinks: number;
private maxCrawledDepth: number; private maxCrawledDepth: number;
private visited: Set<string> = new Set(); private visited: Set<string> = new Set();
private crawledUrls: Set<string> = new Set(); private crawledUrls: Map<string, string> = new Map();
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; private robots: any;
@ -51,7 +51,6 @@ export class WebCrawler {
this.generateImgAltText = generateImgAltText ?? false; this.generateImgAltText = generateImgAltText ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
@ -99,7 +98,7 @@ export class WebCrawler {
concurrencyLimit: number = 5, concurrencyLimit: number = 5,
limit: number = 10000, limit: number = 10000,
maxDepth: number = 10 maxDepth: number = 10
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
// Fetch and parse robots.txt // Fetch and parse robots.txt
try { try {
const response = await axios.get(this.robotsTxtUrl); const response = await axios.get(this.robotsTxtUrl);
@ -111,7 +110,7 @@ export class WebCrawler {
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks; return filteredLinks.map(link => ({ url: link, html: "" }));
} }
const urls = await this.crawlUrls( const urls = await this.crawlUrls(
@ -123,18 +122,19 @@ export class WebCrawler {
urls.length === 0 && urls.length === 0 &&
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
) { ) {
return [this.initialUrl]; return [{ url: this.initialUrl, html: "" }];
} }
// make sure to run include exclude here again // make sure to run include exclude here again
return this.filterLinks(urls, limit, this.maxCrawledDepth); const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
} }
private async crawlUrls( private async crawlUrls(
urls: string[], urls: string[],
concurrencyLimit: number, concurrencyLimit: number,
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => { const queue = async.queue(async (task: string, callback) => {
if (this.crawledUrls.size >= this.maxCrawledLinks) { if (this.crawledUrls.size >= this.maxCrawledLinks) {
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
@ -143,13 +143,13 @@ export class WebCrawler {
return; return;
} }
const newUrls = await this.crawl(task); const newUrls = await this.crawl(task);
newUrls.forEach((url) => this.crawledUrls.add(url)); newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
if (inProgress && newUrls.length > 0) { if (inProgress && newUrls.length > 0) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
total: this.maxCrawledLinks, total: this.maxCrawledLinks,
status: "SCRAPING", status: "SCRAPING",
currentDocumentUrl: newUrls[newUrls.length - 1], currentDocumentUrl: newUrls[newUrls.length - 1].url,
}); });
} else if (inProgress) { } else if (inProgress) {
inProgress({ inProgress({
@ -159,7 +159,7 @@ export class WebCrawler {
currentDocumentUrl: task, currentDocumentUrl: task,
}); });
} }
await this.crawlUrls(newUrls, concurrencyLimit, inProgress); await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
callback(); callback();
} }
@ -175,10 +175,10 @@ export class WebCrawler {
} }
); );
await queue.drain(); await queue.drain();
return Array.from(this.crawledUrls); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
async crawl(url: string): Promise<string[]> { async crawl(url: string): Promise<{url: string, html: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
return []; return [];
this.visited.add(url); this.visited.add(url);
@ -193,16 +193,17 @@ export class WebCrawler {
} }
try { try {
let content; let content : string = "";
// If it is the first link, fetch with scrapingbee // If it is the first link, fetch with single url
if (this.visited.size === 1) { if (this.visited.size === 1) {
content = await scrapWithScrapingBee(url, "load"); const page = await scrapSingleUrl(url, {includeHtml: true});
content = page.html ?? ""
} else { } else {
const response = await axios.get(url); const response = await axios.get(url);
content = response.data; content = response.data ?? "";
} }
const $ = load(content); const $ = load(content);
let links: string[] = []; let links: {url: string, html: string}[] = [];
$("a").each((_, element) => { $("a").each((_, element) => {
const href = $(element).attr("href"); const href = $(element).attr("href");
@ -215,7 +216,6 @@ export class WebCrawler {
const path = url.pathname; const path = url.pathname;
if ( if (
// fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
this.isInternalLink(fullUrl) && this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) && this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) && this.noSections(fullUrl) &&
@ -223,12 +223,13 @@ export class WebCrawler {
!this.matchesExcludes(path) && !this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent") this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) { ) {
links.push(fullUrl); links.push({url: fullUrl, html: content});
} }
} }
}); });
return links.filter((link) => !this.visited.has(link)); // Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(link.url));
} catch (error) { } catch (error) {
return []; return [];
} }

View File

@ -35,6 +35,7 @@ export class WebScraperDataProvider {
private replaceAllPathsWithAbsolutePaths?: boolean = false; private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo"; "gpt-4-turbo";
private crawlerMode: string = "default";
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -46,7 +47,8 @@ export class WebScraperDataProvider {
private async convertUrlsToDocuments( private async convertUrlsToDocuments(
urls: string[], urls: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
const totalUrls = urls.length; const totalUrls = urls.length;
let processedUrls = 0; let processedUrls = 0;
@ -56,7 +58,8 @@ export class WebScraperDataProvider {
const batchUrls = urls.slice(i, i + this.concurrentRequests); const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all( await Promise.all(
batchUrls.map(async (url, index) => { batchUrls.map(async (url, index) => {
const result = await scrapSingleUrl(url, this.pageOptions); const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
processedUrls++; processedUrls++;
if (inProgress) { if (inProgress) {
inProgress({ inProgress({
@ -139,13 +142,26 @@ export class WebScraperDataProvider {
limit: this.limit, limit: this.limit,
generateImgAltText: this.generateImgAltText, generateImgAltText: this.generateImgAltText,
}); });
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
const allLinks = links.map((e) => e.url);
const allHtmls = links.map((e)=> e.html);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(allLinks , inProgress);
}
let documents = [];
// check if fast mode is enabled and there is html inside the links
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
console.log("Fast mode enabled");
documents = await this.processLinks(allLinks, inProgress, allHtmls);
}else{
documents = await this.processLinks(allLinks, inProgress);
} }
let documents = await this.processLinks(links, inProgress); return this.cacheAndFinalizeDocuments(documents, allLinks);
return this.cacheAndFinalizeDocuments(documents, links);
} }
private async handleSingleUrlsMode( private async handleSingleUrlsMode(
@ -187,14 +203,17 @@ export class WebScraperDataProvider {
private async processLinks( private async processLinks(
links: string[], links: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
links = links.filter((link) => !link.endsWith(".pdf")); links = links.filter((link) => !link.endsWith(".pdf"));
let documents = await this.convertUrlsToDocuments(links, inProgress); let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
documents = this.applyPathReplacements(documents); documents = this.applyPathReplacements(documents);
// documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
@ -397,6 +416,7 @@ export class WebScraperDataProvider {
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== ""); this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {

View File

@ -106,7 +106,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -197,8 +198,15 @@ export async function scrapSingleUrl(
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]; : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
// If exists text coming from crawler, use it
if (existingHtml && existingHtml.trim().length >= 100) {
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
text = await parseMarkdown(cleanedHtml);
html = existingHtml;
break;
}
[text, html] = await attemptScraping(urlToScrap, scraper); [text, html] = await attemptScraping(urlToScrap, scraper);
if (text && text.length >= 100) break; if (text && text.trim().length >= 100) break;
console.log(`Falling back to ${scraper}`); console.log(`Falling back to ${scraper}`);
} }

View File

@ -26,7 +26,7 @@ getWebScraperQueue().process(
success: success, success: success,
result: { result: {
links: docs.map((doc) => { links: docs.map((doc) => {
return { content: doc, source: doc.metadata.sourceURL }; return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
}), }),
}, },
project_id: job.data.project_id, project_id: job.data.project_id,

View File

@ -176,7 +176,7 @@ describe("Scraping Checkup (E2E)", () => {
} }
expect(score).toBeGreaterThanOrEqual(75); expect(score).toBeGreaterThanOrEqual(70);
}, 350000); // 350 seconds timeout to account for network delays }, 350000); // 350 seconds timeout to account for network delays
}); });
}); });