Merge branch 'main' into py-sdk-improve-response-handling

This commit is contained in:
Rafael Miller 2024-06-14 10:42:51 -03:00 committed by GitHub
commit 5a5c532bea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
78 changed files with 666 additions and 260 deletions

View File

@ -0,0 +1,20 @@
name: Clean Before 24h Completed Jobs
on:
schedule:
- cron: '0 0 * * *'
env:
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
jobs:
clean-jobs:
runs-on: ubuntu-latest
steps:
- name: Send GET request to clean jobs
run: |
response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs)
if [ "$response" -ne 200 ]; then
echo "Failed to clean jobs. Response: $response"
exit 1
fi
echo "Successfully cleaned jobs. Response: $response"

View File

@ -183,6 +183,7 @@ jobs:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
build-and-publish-python-sdk:
name: Build and publish Python SDK
runs-on: ubuntu-latest
needs: deploy
@ -213,7 +214,7 @@ jobs:
working-directory: ./apps/python-sdk
- name: Publish to PyPI
if: ${{ env.VERSION_INCREMENTED == 'true' }}
if: ${{ env.PYTHON_SDK_VERSION_INCREMENTED == 'true' }}
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
@ -222,6 +223,7 @@ jobs:
working-directory: ./apps/python-sdk
build-and-publish-js-sdk:
name: Build and publish JavaScript SDK
runs-on: ubuntu-latest
needs: deploy

View File

@ -54,7 +54,7 @@ kill_timeout = '5s'
soft_limit = 12
[[vm]]
size = 'performance-8x'
size = 'performance-4x'
processes = ['app']

View File

@ -51,10 +51,26 @@
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
}
}
},
@ -176,10 +192,20 @@
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
"default": "default"
},
"ignoreSitemap": {
"type": "boolean",
"description": "Ignore the website sitemap when crawling",
"default": false
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
},
"allowBackwardCrawling": {
"type": "boolean",
"description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
"default": false
}
}
},
@ -195,6 +221,27 @@
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"headers": {
"type": "object",
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
},
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
}
}
}
@ -368,7 +415,7 @@
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
"description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled."
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
}
}
}
@ -513,6 +560,10 @@
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"index": {
"type": "integer",
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
},
"metadata": {
"type": "object",
"properties": {

View File

@ -1,5 +1,4 @@
import request from "supertest";
import { app } from "../../index";
import dotenv from "dotenv";
const fs = require("fs");
const path = require("path");

View File

@ -1,5 +1,4 @@
import request from "supertest";
import { app } from "../../index";
import dotenv from "dotenv";
import { v4 as uuidv4 } from "uuid";
@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => {
describe("POST /v0/scrape", () => {
it.concurrent("should require authorization", async () => {
const response = await request(app).post("/v0/scrape");
const response = await request(TEST_URL).post("/v0/scrape");
expect(response.statusCode).toBe(401);
});
@ -136,6 +135,40 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
const responseWithoutRemoveTags = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/" });
expect(responseWithoutRemoveTags.statusCode).toBe(200);
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
const response = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.content).toContain("Scrape This Site");
expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
expect(response.body.data.content).not.toContain("web scraping"); // strong
}, 30000); // 30 seconds timeout
// TODO: add this test back once we nail the waitFor option to be more deterministic
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
// const startTime = Date.now();
@ -596,7 +629,7 @@ describe("E2E Tests for API Routes", () => {
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://roastmywebsite.ai" });
.send({ url: "https://mendable.ai/blog" });
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
@ -622,7 +655,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("_Roast_");
expect(completedResponse.body.data[0].content).toContain("Mendable");
const childrenLinks = completedResponse.body.data.filter(doc =>
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
}, 120000); // 120 seconds
it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
@ -757,34 +796,82 @@ describe("E2E Tests for API Routes", () => {
}, 60000);
}); // 60 seconds
it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai/blog",
pageOptions: { includeHtml: true },
crawlerOptions: { allowBackwardCrawling: true },
});
expect(crawlResponse.statusCode).toBe(200);
let isFinished = false;
let completedResponse;
while (!isFinished) {
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
if (response.body.status === "completed") {
isFinished = true;
completedResponse = response;
} else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
}
}
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0]).toHaveProperty("html");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].markdown).toContain("Mendable");
const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
});
expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length);
}, 60000);
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://jestjs.io" });
expect(crawlResponse.statusCode).toBe(200);
// wait for 20 seconds
await new Promise((r) => setTimeout(r, 20000));
const response = await request(TEST_URL)
const responseCancel = await request(TEST_URL)
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(response.body.status).toBe("cancelled");
expect(responseCancel.statusCode).toBe(200);
expect(responseCancel.body).toHaveProperty("status");
expect(responseCancel.body.status).toBe("cancelled");
await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("failed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data).toEqual(null);
expect(completedResponse.body.data).toBeNull();
expect(completedResponse.body).toHaveProperty("partial_data");
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");

View File

@ -143,7 +143,7 @@ export async function supaAuthenticateUser(
const startDate = new Date();
const endDate = new Date();
endDate.setDate(endDate.getDate() + 7);
await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
return {
success: false,
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,

View File

@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) {
}
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
const crawlerOptions = req.body.crawlerOptions ?? {
allowBackwardCrawling: false
};
const pageOptions = req.body.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
removeTags: []
};
if (mode === "single_urls" && !url.includes(",")) {
try {
@ -64,9 +70,7 @@ export async function crawlController(req: Request, res: Response) {
await a.setOptions({
mode: "single_urls",
urls: [url],
crawlerOptions: {
returnOnlyUrls: true,
},
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
pageOptions: pageOptions,
});
@ -91,7 +95,7 @@ export async function crawlController(req: Request, res: Response) {
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
crawlerOptions: { ...crawlerOptions },
crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: req.body.origin ?? "api",

View File

@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
const job = await addWebScraperJob({
url: url,

View File

@ -85,6 +85,7 @@ export async function searchHelper(
onlyMainContent: pageOptions?.onlyMainContent ?? true,
fetchPageContent: pageOptions?.fetchPageContent ?? true,
includeHtml: pageOptions?.includeHtml ?? false,
removeTags: pageOptions?.removeTags ?? [],
fallback: false,
},
});
@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
includeHtml: false,
onlyMainContent: true,
fetchPageContent: true,
removeTags: [],
fallback: false,
};
const origin = req.body.origin ?? "api";

View File

@ -5,13 +5,32 @@ import "dotenv/config";
import { getWebScraperQueue } from "./services/queue-service";
import { redisClient } from "./services/rate-limiter";
import { v0Router } from "./routes/v0";
import { initSDK } from '@hyperdx/node-opentelemetry';
import { initSDK } from "@hyperdx/node-opentelemetry";
import cluster from "cluster";
import os from "os";
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express");
export const app = express();
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
console.log(`Number of CPUs: ${numCPUs} available`);
if (cluster.isMaster) {
console.log(`Master ${process.pid} is running`);
// Fork workers.
for (let i = 0; i < numCPUs; i++) {
cluster.fork();
}
cluster.on("exit", (worker, code, signal) => {
console.log(`Worker ${worker.process.pid} exited`);
console.log("Starting a new worker");
cluster.fork();
});
} else {
const app = express();
global.isProduction = process.env.IS_PRODUCTION === "true";
@ -50,14 +69,13 @@ const HOST = process.env.HOST ?? "localhost";
redisClient.connect();
// HyperDX OpenTelemetry
if(process.env.ENV === 'production') {
if (process.env.ENV === "production") {
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
}
export function startServer(port = DEFAULT_PORT) {
function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
console.log(`Server listening on port ${port}`);
console.log(`Worker ${process.pid} listening on port ${port}`);
console.log(
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
);
@ -112,7 +130,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
}
});
app.get('/serverHealthCheck/notify', async (req, res) => {
app.get("/serverHealthCheck/notify", async (req, res) => {
if (process.env.SLACK_WEBHOOK_URL) {
const treshold = 1; // The threshold value for the active jobs
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
@ -138,19 +156,21 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
if (waitingJobsCount >= treshold) {
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
const message = {
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`,
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
timeout / 60000
} minute(s).`,
};
const response = await fetch(slackWebhookUrl, {
method: 'POST',
method: "POST",
headers: {
'Content-Type': 'application/json',
"Content-Type": "application/json",
},
body: JSON.stringify(message),
})
});
if (!response.ok) {
console.error('Failed to send Slack notification')
console.error("Failed to send Slack notification");
}
}
}, timeout);
@ -164,10 +184,36 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
}
});
app.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
async (req, res) => {
try {
const webScraperQueue = getWebScraperQueue();
const completedJobs = await webScraperQueue.getJobs(["completed"]);
const before24hJobs = completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
);
const jobIds = before24hJobs.map((job) => job.id) as string[];
let count = 0;
for (const jobId of jobIds) {
try {
await webScraperQueue.removeJobs(jobId);
count++;
} catch (jobError) {
console.error(`Failed to remove job with ID ${jobId}:`, jobError);
}
}
res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
console.error("Failed to clean last 24h complete jobs:", error);
res.status(500).send("Failed to clean jobs");
}
}
);
app.get("/is-production", (req, res) => {
res.send({ isProduction: global.isProduction });
});
// /workers health check; can't act as a load balancer, just has to be a pre-deploy thing
console.log(`Worker ${process.pid} started`);
}

View File

@ -18,6 +18,8 @@ export type PageOptions = {
waitFor?: number;
screenshot?: boolean;
headers?: Record<string, string>;
replaceAllPathsWithAbsolutePaths?: boolean;
removeTags?: string | string[];
};
export type ExtractorOptions = {
@ -35,10 +37,7 @@ export type SearchOptions = {
location?: string;
};
export type WebScraperOptions = {
urls: string[];
mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: {
export type CrawlerOptions = {
returnOnlyUrls?: boolean;
includes?: string[];
excludes?: string[];
@ -47,8 +46,15 @@ export type WebScraperOptions = {
limit?: number;
generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean;
ignoreSitemap?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
};
allowBackwardCrawling?: boolean;
}
export type WebScraperOptions = {
urls: string[];
mode: "single_urls" | "sitemap" | "crawl";
crawlerOptions?: CrawlerOptions;
pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;
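
With CrawlerOptions extracted into its own named type, WebScraperOptions now composes it alongside PageOptions. A rough sketch (import path assumed, values illustrative) of how the refactored types fit together:

```typescript
import { CrawlerOptions, PageOptions, WebScraperOptions } from "./lib/entities"; // path assumed

// Illustrative values; field names mirror the types shown above.
const crawlerOptions: CrawlerOptions = {
  returnOnlyUrls: false,
  excludes: ["admin/.*"],
  limit: 500,
  ignoreSitemap: true,          // new field
  mode: "fast",
  allowBackwardCrawling: false, // new field
};

const pageOptions: PageOptions = {
  waitFor: 500,
  screenshot: false,
  headers: { "User-Agent": "firecrawl" },
  replaceAllPathsWithAbsolutePaths: true, // new field
  removeTags: [".ad", "#footer"],         // new field; also accepts a comma-separated string
};

const scraperOptions: WebScraperOptions = {
  urls: ["https://example.com"],
  mode: "crawl",
  crawlerOptions,
  pageOptions,
};
```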

View File

@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import async from "async";
import { Progress } from "../../lib/entities";
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser";
@ -20,6 +20,7 @@ export class WebCrawler {
private robotsTxtUrl: string;
private robots: any;
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
constructor({
initialUrl,
@ -29,6 +30,7 @@ export class WebCrawler {
limit = 10000,
generateImgAltText = false,
maxCrawledDepth = 10,
allowBackwardCrawling = false
}: {
initialUrl: string;
includes?: string[];
@ -37,6 +39,7 @@ export class WebCrawler {
limit?: number;
generateImgAltText?: boolean;
maxCrawledDepth?: number;
allowBackwardCrawling?: boolean;
}) {
this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin;
@ -49,6 +52,7 @@ export class WebCrawler {
this.maxCrawledLinks = maxCrawledLinks ?? limit;
this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
}
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@ -90,10 +94,16 @@ export class WebCrawler {
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
if (linkHostname !== initialHostname) {
return false;
}
if (!this.allowBackwardCrawling) {
if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
return false;
}
}
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt
if (!isAllowed) {
@ -108,6 +118,8 @@ export class WebCrawler {
public async start(
inProgress?: (progress: Progress) => void,
pageOptions?: PageOptions,
crawlerOptions?: CrawlerOptions,
concurrencyLimit: number = 5,
limit: number = 10000,
maxDepth: number = 10
@ -122,17 +134,21 @@ export class WebCrawler {
}
if (!crawlerOptions?.ignoreSitemap) {
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks.map(link => ({ url: link, html: "" }));
}
}
const urls = await this.crawlUrls(
[this.initialUrl],
pageOptions,
concurrencyLimit,
inProgress
);
if (
urls.length === 0 &&
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@ -140,14 +156,15 @@ export class WebCrawler {
return [{ url: this.initialUrl, html: "" }];
}
// make sure to run include exclude here again
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
}
private async crawlUrls(
urls: string[],
pageOptions: PageOptions,
concurrencyLimit: number,
inProgress?: (progress: Progress) => void,
): Promise<{ url: string, html: string }[]> {
@ -158,7 +175,7 @@ export class WebCrawler {
}
return;
}
const newUrls = await this.crawl(task);
const newUrls = await this.crawl(task, pageOptions);
// add the initial url if not already added
// if (this.visited.size === 1) {
// let normalizedInitial = this.initialUrl;
@ -188,7 +205,7 @@ export class WebCrawler {
currentDocumentUrl: task,
});
}
await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
if (callback && typeof callback === "function") {
callback();
}
@ -207,20 +224,18 @@ export class WebCrawler {
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
}
async crawl(url: string): Promise<{url: string, html: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
const normalizedUrl = this.normalizeCrawlUrl(url);
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return [];
}
this.visited.add(url);
this.visited.add(normalizedUrl);
if (!url.startsWith("http")) {
url = "https://" + url;
}
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
@ -231,8 +246,8 @@ export class WebCrawler {
let content: string = "";
// If it is the first link, fetch with single url
if (this.visited.size === 1) {
const page = await scrapSingleUrl(url, {includeHtml: true});
content = page.html ?? ""
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
content = page.html ?? "";
} else {
const response = await axios.get(url);
content = response.data ?? "";
@ -241,12 +256,10 @@ export class WebCrawler {
let links: { url: string, html: string }[] = [];
// Add the initial URL to the list of links
if(this.visited.size === 1)
{
if (this.visited.size === 1) {
links.push({ url, html: content });
}
$("a").each((_, element) => {
const href = $(element).attr("href");
if (href) {
@ -254,14 +267,15 @@ export class WebCrawler {
if (!href.startsWith("http")) {
fullUrl = new URL(href, this.baseUrl).toString();
}
const url = new URL(fullUrl);
const path = url.pathname;
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (
this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) &&
this.matchesIncludes(path) &&
// The includes check below is commented out to allow wider website coverage, since links are filtered again afterwards
// this.matchesIncludes(path) &&
!this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) {
@ -274,12 +288,22 @@ export class WebCrawler {
return links;
}
// Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(link.url));
return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
} catch (error) {
return [];
}
}
private normalizeCrawlUrl(url: string): string {
try {
const urlObj = new URL(url);
urlObj.searchParams.sort(); // Sort query parameters to normalize
return urlObj.toString();
} catch (error) {
return url;
}
}
private matchesIncludes(url: string): boolean {
if (this.includes.length === 0 || this.includes[0] == "") return true;
return this.includes.some((pattern) => new RegExp(pattern).test(url));
@ -388,7 +412,6 @@ export class WebCrawler {
// Normalize and check if the URL is present in any of the sitemaps
const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing the crawler from crawling

View File

@ -31,12 +31,14 @@ export class WebScraperDataProvider {
private limit: number = 10000;
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
private ignoreSitemap: boolean = false;
private pageOptions?: PageOptions;
private extractorOptions?: ExtractorOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo";
private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false;
authorize(): void {
throw new Error("Method not implemented.");
@ -169,10 +171,15 @@ export class WebScraperDataProvider {
maxCrawledDepth: this.maxCrawledDepth,
limit: this.limit,
generateImgAltText: this.generateImgAltText,
allowBackwardCrawling: this.allowBackwardCrawling,
});
let links = await crawler.start(
inProgress,
this.pageOptions,
{
ignoreSitemap: this.ignoreSitemap,
},
5,
this.limit,
this.maxCrawledDepth
@ -296,9 +303,10 @@ export class WebScraperDataProvider {
}
private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths
? replacePathsWithAbsolutePaths(documents)
: replaceImgPathsWithAbsolutePaths(documents);
if (this.replaceAllPathsWithAbsolutePaths) {
documents = replacePathsWithAbsolutePaths(documents);
}
return replaceImgPathsWithAbsolutePaths(documents);
}
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
@ -467,12 +475,19 @@ export class WebScraperDataProvider {
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
this.pageOptions = options.pageOptions ?? {
onlyMainContent: false,
includeHtml: false,
replaceAllPathsWithAbsolutePaths: false,
removeTags: []
};
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {

View File

@ -8,6 +8,7 @@ import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
import axios from "axios";
dotenv.config();
@ -19,6 +20,8 @@ const baseScrapers = [
"fetch",
] as const;
const universalTimeout = 15000;
export async function generateRequestParams(
url: string,
wait_browser: string = "domcontentloaded",
@ -59,21 +62,24 @@ export async function scrapWithFireEngine(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
);
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + "/scrape",
{
url: url,
wait: waitParam,
screenshot: screenshotParam,
headers: headers,
pageOptions: pageOptions
}),
});
pageOptions: pageOptions,
},
{
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout + waitParam
}
);
if (!response.ok) {
if (response.status !== 200) {
console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
);
@ -84,13 +90,17 @@ export async function scrapWithFireEngine(
if (contentType && contentType.includes("application/pdf")) {
return { html: await fetchAndProcessPdf(url), screenshot: "" };
} else {
const data = await response.json();
const data = response.data;
const html = data.content;
const screenshot = data.screenshot;
return { html: html ?? "", screenshot: screenshot ?? "" };
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
console.log(`[Fire-Engine] Request timed out for ${url}`);
} else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
}
return { html: "", screenshot: "" };
}
}
@ -98,7 +108,7 @@ export async function scrapWithFireEngine(
export async function scrapWithScrapingBee(
url: string,
wait_browser: string = "domcontentloaded",
timeout: number = 15000
timeout: number = universalTimeout
): Promise<string> {
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@ -141,15 +151,19 @@ export async function scrapWithPlaywright(
// If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
method: "POST",
const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
url: url,
wait_after_load: waitParam,
headers: headers,
}, {
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ url: url, wait_after_load: waitParam, headers: headers }),
timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
});
if (!response.ok) {
if (response.status !== 200) {
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
);
@ -160,7 +174,7 @@ export async function scrapWithPlaywright(
if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url);
} else {
const textData = await response.text();
const textData = response.data;
try {
const data = JSON.parse(textData);
const html = data.content;
@ -171,17 +185,28 @@ export async function scrapWithPlaywright(
}
}
} catch (error) {
if (error.code === 'ECONNABORTED') {
console.log(`[Playwright] Request timed out for ${url}`);
} else {
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
}
return "";
}
}
export async function scrapWithFetch(url: string): Promise<string> {
try {
const response = await fetch(url);
if (!response.ok) {
const response = await axios.get(url, {
headers: {
"Content-Type": "application/json",
},
timeout: universalTimeout,
transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
});
if (response.status !== 200) {
console.error(
`[Fetch] Error fetching url: ${url} with status: ${response.status}`
`[Axios] Error fetching url: ${url} with status: ${response.status}`
);
return "";
}
@ -190,11 +215,15 @@ export async function scrapWithFetch(url: string): Promise<string> {
if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url);
} else {
const text = await response.text();
const text = response.data;
return text;
}
} catch (error) {
console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
if (error.code === 'ECONNABORTED') {
console.log(`[Axios] Request timed out for ${url}`);
} else {
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
}
return "";
}
}
@ -275,6 +304,19 @@ export async function scrapSingleUrl(
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
const soup = cheerio.load(html);
soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') {
pageOptions.removeTags.split(',').forEach((tag) => {
soup(tag.trim()).remove();
});
} else if (Array.isArray(pageOptions.removeTags)) {
pageOptions.removeTags.forEach((tag) => {
soup(tag).remove();
});
}
}
if (pageOptions.onlyMainContent) {
// remove any other tags that are not in the main content
excludeNonMainTags.forEach((tag) => {

View File

@ -12,6 +12,7 @@ export async function getLinksFromSitemap(
content = response.data;
} catch (error) {
console.error(`Request failed for ${sitemapUrl}: ${error}`);
return allUrls;
}

View File

@ -6,12 +6,14 @@ describe('replacePaths', () => {
it('should replace relative paths with absolute paths', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).'
content: 'This is a [link](/path/to/resource).',
markdown: 'This is a [link](/path/to/resource).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).'
content: 'This is a [link](https://example.com/path/to/resource).',
markdown: 'This is a [link](https://example.com/path/to/resource).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@ -21,7 +23,8 @@ describe('replacePaths', () => {
it('should not alter absolute URLs', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).'
content: 'This is an [external link](https://external.com/path).',
markdown: 'This is an [external link](https://external.com/path).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@ -31,7 +34,8 @@ describe('replacePaths', () => {
it('should not alter data URLs for images', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).',
markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@ -41,12 +45,14 @@ describe('replacePaths', () => {
it('should handle multiple links and images correctly', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).'
content: 'Here are two links: [link1](/path1) and [link2](/path2).',
markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).'
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@ -56,12 +62,14 @@ describe('replacePaths', () => {
it('should correctly handle a mix of absolute and relative paths', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@ -74,12 +82,14 @@ describe('replacePaths', () => {
it('should replace relative image paths with absolute paths', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Here is an image: ![alt text](/path/to/image.jpg).'
content: 'Here is an image: ![alt text](/path/to/image.jpg).',
markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
}];
const result = replaceImgPathsWithAbsolutePaths(documents);
@ -89,7 +99,8 @@ describe('replacePaths', () => {
it('should not alter data:image URLs', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).',
markdown: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
}];
const result = replaceImgPathsWithAbsolutePaths(documents);
@ -99,12 +110,14 @@ describe('replacePaths', () => {
it('should handle multiple images with a mix of data and relative URLs', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).',
markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).',
markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
}];
const result = replaceImgPathsWithAbsolutePaths(documents);

View File

@ -10,6 +10,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
) || [];
paths.forEach((path: string) => {
try {
const isImage = path.startsWith("!");
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
let url = matchedUrl[1];
@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
}
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
if (isImage) {
document.content = document.content.replace(
path,
`${markdownLinkOrImageText}(${url})`
);
} else {
// Image is handled afterwards
if (!isImage) {
document.content = document.content.replace(
path,
`${markdownLinkOrImageText}(${url})`
);
}
} catch (error) {
}
});
document.markdown = document.content;
});
return documents;
@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
if (!imageUrl.startsWith("http")) {
if (imageUrl.startsWith("/")) {
imageUrl = imageUrl.substring(1);
}
imageUrl = new URL(imageUrl, baseUrl).toString();
} else {
imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
}
}
}
@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
`![${altText}](${imageUrl})`
);
});
document.markdown = document.content;
});
return documents;

View File

@ -38,7 +38,7 @@ getWebScraperQueue().process(
error: message /* etc... */,
};
await callWebhook(job.data.team_id, data);
await callWebhook(job.data.team_id, job.id as string, data);
await logJob({
success: success,
@ -78,7 +78,7 @@ getWebScraperQueue().process(
error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
};
await callWebhook(job.data.team_id, data);
await callWebhook(job.data.team_id, job.id as string, data);
await logJob({
success: false,
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),

View File

@ -1,8 +1,35 @@
import Redis from 'ioredis';
import Redis from "ioredis";
// Initialize Redis client
const redis = new Redis(process.env.REDIS_URL);
// Listen for 'error' events on the Redis connection
redis.on("error", (error) => {
try {
if (error.message === "ECONNRESET") {
console.log("Connection to Redis Session Store timed out.");
} else if (error.message === "ECONNREFUSED") {
console.log("Connection to Redis Session Store refused!");
} else console.log(error);
} catch (error) {}
});
// Listen for the 'reconnecting' event on Redis
redis.on("reconnecting", (err) => {
try {
if (redis.status === "reconnecting")
console.log("Reconnecting to Redis Session Store...");
else console.log("Error reconnecting to Redis Session Store.");
} catch (error) {}
});
// Listen for the 'connect' event on Redis
redis.on("connect", (err) => {
try {
if (!err) console.log("Connected to Redis Session Store!");
} catch (error) {}
});
/**
* Set a value in Redis with an optional expiration time.
* @param {string} key The key under which to store the value.
@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL);
*/
const setValue = async (key: string, value: string, expire?: number) => {
if (expire) {
await redis.set(key, value, 'EX', expire);
await redis.set(key, value, "EX", expire);
} else {
await redis.set(key, value);
}

View File

@ -1,17 +1,19 @@
import { supabase_service } from "./supabase";
export const callWebhook = async (teamId: string, data: any) => {
export const callWebhook = async (teamId: string, jobId: string, data: any) => {
try {
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
let webhookUrl = selfHostedUrl;
if (!selfHostedUrl) {
// Only fetch the webhook URL from the database if the self-hosted webhook URL is not set
// and the USE_DB_AUTHENTICATION environment variable is set to true
if (!selfHostedUrl && useDbAuthentication) {
const { data: webhooksData, error } = await supabase_service
.from("webhooks")
.select("url")
.eq("team_id", teamId)
.limit(1);
if (error) {
console.error(
`Error fetching webhook URL for team ID: ${teamId}`,
@ -45,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => {
},
body: JSON.stringify({
success: data.success,
jobId: jobId,
data: dataToSend,
error: data.error || undefined,
}),
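
callWebhook now forwards the jobId alongside success, data, and error. A hedged sketch of a receiving endpoint (Express) matching the payload shape assembled above; the route path and port are placeholders, not part of this change:

```typescript
import express from "express";

const app = express();
app.use(express.json());

// Hypothetical receiver for the webhook body shown above: { success, jobId, data, error }.
app.post("/firecrawl-webhook", (req, res) => {
  const { success, jobId, data, error } = req.body;
  if (success) {
    console.log(`Job ${jobId} finished with ${Array.isArray(data) ? data.length : 0} documents`);
  } else {
    console.error(`Job ${jobId} failed: ${error}`);
  }
  res.sendStatus(200);
});

app.listen(3000);
```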

View File

@ -1,3 +1,57 @@
"""
This is the Firecrawl package.
This package provides a Python SDK for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs.
For more information visit https://github.com/firecrawl/
"""
import logging
import os
from .firecrawl import FirecrawlApp
__version__ = "0.0.14"
__version__ = "0.0.15"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
def _basic_config() -> None:
"""Set up basic configuration for logging with a specific format and date format."""
try:
logging.basicConfig(
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
except Exception as e:
logger.error("Failed to configure logging: %s", e)
def setup_logging() -> None:
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
env = os.environ.get(
"FIRECRAWL_LOGGING_LEVEL", "INFO"
).upper() # Default to 'INFO' level
_basic_config()
if env == "DEBUG":
logger.setLevel(logging.DEBUG)
elif env == "INFO":
logger.setLevel(logging.INFO)
elif env == "WARNING":
logger.setLevel(logging.WARNING)
elif env == "ERROR":
logger.setLevel(logging.ERROR)
elif env == "CRITICAL":
logger.setLevel(logging.CRITICAL)
else:
logger.setLevel(logging.INFO)
logger.warning("Unknown logging level: %s, defaulting to INFO", env)
# Initialize logging configuration when the module is imported
setup_logging()
logger.debug("Debugging logger setup")

View File

@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
Classes:
- FirecrawlApp: Main class for interacting with the Firecrawl API.
"""
import logging
import os
import time
from typing import Any, Dict, Optional
import requests
logger : logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp:
"""
@ -28,8 +29,14 @@ class FirecrawlApp:
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
if self.api_key is None:
logger.warning("No API key provided")
raise ValueError('No API key provided')
else:
logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
if self.api_url != 'https://api.firecrawl.dev':
logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""

View File

11 image files changed with identical before/after dimensions and sizes (7.8 KiB, 23 KiB, 7.0 KiB, 444 KiB, 492 B, 997 B, 15 KiB, 1.3 KiB, 262 KiB, 629 B, 15 KiB), likely moved or renamed.

View File

@ -1,4 +1,4 @@
import OpenAI from "openai";
import OpenAI from "openai/index.mjs";
import { encoding_for_model } from "@dqbd/tiktoken";
/**