Merge branch 'main' into py-sdk-improve-response-handling

Rafael Miller 2024-06-14 10:42:51 -03:00 committed by GitHub
commit 5a5c532bea
78 changed files with 666 additions and 260 deletions

View File

@@ -0,0 +1,20 @@
name: Clean Before 24h Completed Jobs
on:
  schedule:
    - cron: '0 0 * * *'

env:
  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}

jobs:
  clean-jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Send GET request to clean jobs
        run: |
          response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs)
          if [ "$response" -ne 200 ]; then
            echo "Failed to clean jobs. Response: $response"
            exit 1
          fi
          echo "Successfully cleaned jobs. Response: $response"

View File

@@ -183,6 +183,7 @@ jobs:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
build-and-publish-python-sdk:
+ name: Build and publish Python SDK
runs-on: ubuntu-latest
needs: deploy
@@ -213,7 +214,7 @@ jobs:
working-directory: ./apps/python-sdk
- name: Publish to PyPI
- if: ${{ env.VERSION_INCREMENTED == 'true' }}
+ if: ${{ env.PYTHON_SDK_VERSION_INCREMENTED == 'true' }}
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
@@ -222,6 +223,7 @@ jobs:
working-directory: ./apps/python-sdk
build-and-publish-js-sdk:
+ name: Build and publish JavaScript SDK
runs-on: ubuntu-latest
needs: deploy

View File

@@ -54,7 +54,7 @@ kill_timeout = '5s'
soft_limit = 12
[[vm]]
- size = 'performance-8x'
+ size = 'performance-4x'
processes = ['app']

View File

@@ -51,10 +51,26 @@
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
},
+ "screenshot": {
+ "type": "boolean",
+ "description": "Include a screenshot of the top of the page that you are scraping.",
+ "default": false
+ },
"waitFor": {
"type": "integer",
"description": "Wait x amount of milliseconds for the page to load to fetch content",
"default": 0
},
+ "removeTags": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+ },
+ "headers": {
+ "type": "object",
+ "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
+ }
}
}
},
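The hunk above adds `screenshot`, `removeTags`, and `headers` to the scrape endpoint's `pageOptions`. As a rough, non-authoritative sketch of how a client could exercise the new fields (the endpoint path and bearer auth come from this spec; the target URL, API-key variable, and option values are placeholders):

```ts
// Hypothetical request against the documented POST /v0/scrape endpoint.
const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`, // placeholder env var
  },
  body: JSON.stringify({
    url: "https://example.com",
    pageOptions: {
      screenshot: true,                         // new: capture the top of the page
      waitFor: 1000,                            // existing: wait 1s before reading content
      removeTags: ["script", ".ad", "#footer"], // new: selectors stripped before conversion
      headers: { "User-Agent": "my-bot/1.0" },  // new: headers forwarded to the target site
    },
  }),
});
const body = await res.json();
console.log(body.data?.metadata?.title);
```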
@@ -176,10 +192,20 @@
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
"default": "default"
},
+ "ignoreSitemap": {
+ "type": "boolean",
+ "description": "Ignore the website sitemap when crawling",
+ "default": false
+ },
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
},
+ "allowBackwardCrawling": {
+ "type": "boolean",
+ "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+ "default": false
+ }
}
}
},
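The crawl schema gains `ignoreSitemap` and `allowBackwardCrawling` next to the existing `limit`. A minimal sketch of a crawl request using them (same placeholder auth as the scrape example; option names mirror the schema above):

```ts
// Hypothetical kickoff of a crawl that skips the sitemap and may walk back
// from the starting path (e.g. from /blog up to other sections of the site).
const crawlRes = await fetch("https://api.firecrawl.dev/v0/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`, // placeholder env var
  },
  body: JSON.stringify({
    url: "https://example.com/blog",
    crawlerOptions: {
      ignoreSitemap: true,         // new: do not seed the crawl from sitemap.xml
      allowBackwardCrawling: true, // new: allow URLs outside the initial path
      limit: 100,
    },
  }),
});
const { jobId } = await crawlRes.json();
```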
@@ -195,6 +221,27 @@
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
},
+ "screenshot": {
+ "type": "boolean",
+ "description": "Include a screenshot of the top of the page that you are scraping.",
+ "default": false
+ },
+ "headers": {
+ "type": "object",
+ "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
+ },
+ "removeTags": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+ },
+ "replaceAllPathsWithAbsolutePaths": {
+ "type": "boolean",
+ "description": "Replace all relative paths with absolute paths for images and links",
+ "default": false
+ }
}
}
}
@@ -368,7 +415,7 @@
"items": {
"$ref": "#/components/schemas/CrawlStatusResponseObj"
},
- "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled."
+ "description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
}
}
}
@@ -513,6 +560,10 @@
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
+ "index": {
+ "type": "integer",
+ "description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
+ },
"metadata": {
"type": "object",
"properties": {

View File

@@ -1,5 +1,4 @@
import request from "supertest";
- import { app } from "../../index";
import dotenv from "dotenv";
const fs = require("fs");
const path = require("path");

View File

@@ -1,5 +1,4 @@
import request from "supertest";
- import { app } from "../../index";
import dotenv from "dotenv";
import { v4 as uuidv4 } from "uuid";
@@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => {
describe("POST /v0/scrape", () => {
it.concurrent("should require authorization", async () => {
- const response = await request(app).post("/v0/scrape");
+ const response = await request(TEST_URL).post("/v0/scrape");
expect(response.statusCode).toBe(401);
});
@@ -136,6 +135,40 @@ describe("E2E Tests for API Routes", () => {
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
}, 60000); // 60 seconds
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
const responseWithoutRemoveTags = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/" });
expect(responseWithoutRemoveTags.statusCode).toBe(200);
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
const response = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("content");
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.content).toContain("Scrape This Site");
expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
expect(response.body.data.content).not.toContain("web scraping"); // strong
}, 30000); // 30 seconds timeout
// TODO: add this test back once we nail the waitFor option to be more deterministic
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
//   const startTime = Date.now();
@@ -596,7 +629,7 @@ describe("E2E Tests for API Routes", () => {
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
- .send({ url: "https://roastmywebsite.ai" });
+ .send({ url: "https://mendable.ai/blog" });
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
@@ -622,7 +655,13 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
- expect(completedResponse.body.data[0].content).toContain("_Roast_");
+ expect(completedResponse.body.data[0].content).toContain("Mendable");
const childrenLinks = completedResponse.body.data.filter(doc =>
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
}, 120000); // 120 seconds
it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
@@ -757,34 +796,82 @@ describe("E2E Tests for API Routes", () => {
}, 60000);
}); // 60 seconds
it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai/blog",
pageOptions: { includeHtml: true },
crawlerOptions: { allowBackwardCrawling: true },
});
expect(crawlResponse.statusCode).toBe(200);
let isFinished = false;
let completedResponse;
while (!isFinished) {
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
if (response.body.status === "completed") {
isFinished = true;
completedResponse = response;
} else {
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
}
}
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0]).toHaveProperty("html");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].markdown).toContain("Mendable");
const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
});
expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length);
}, 60000);
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => { it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ url: "https://jestjs.io" }); .send({ url: "https://jestjs.io" });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
// wait for 30 seconds
await new Promise((r) => setTimeout(r, 20000)); await new Promise((r) => setTimeout(r, 20000));
const response = await request(TEST_URL) const responseCancel = await request(TEST_URL)
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`) .delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200); expect(responseCancel.statusCode).toBe(200);
expect(response.body).toHaveProperty("status"); expect(responseCancel.body).toHaveProperty("status");
expect(response.body.status).toBe("cancelled"); expect(responseCancel.body.status).toBe("cancelled");
await new Promise((r) => setTimeout(r, 10000)); await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL) const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(completedResponse.statusCode).toBe(200); expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("failed"); expect(completedResponse.body.status).toBe("failed");
expect(completedResponse.body).toHaveProperty("data"); expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data).toEqual(null); expect(completedResponse.body.data).toBeNull();
expect(completedResponse.body).toHaveProperty("partial_data"); expect(completedResponse.body).toHaveProperty("partial_data");
expect(completedResponse.body.partial_data[0]).toHaveProperty("content"); expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown"); expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");

View File

@@ -143,7 +143,7 @@ export async function supaAuthenticateUser(
const startDate = new Date();
const endDate = new Date();
endDate.setDate(endDate.getDate() + 7);
- await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
+ // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
return {
success: false,
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,

View File

@@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) {
}
const mode = req.body.mode ?? "crawl";
- const crawlerOptions = req.body.crawlerOptions ?? {};
- const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+ const crawlerOptions = req.body.crawlerOptions ?? {
+ allowBackwardCrawling: false
+ };
+ const pageOptions = req.body.pageOptions ?? {
+ onlyMainContent: false,
+ includeHtml: false,
+ removeTags: []
+ };
if (mode === "single_urls" && !url.includes(",")) {
try {
@@ -64,9 +70,7 @@
await a.setOptions({
mode: "single_urls",
urls: [url],
- crawlerOptions: {
- returnOnlyUrls: true,
- },
+ crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
pageOptions: pageOptions,
});
@@ -91,7 +95,7 @@
const job = await addWebScraperJob({
url: url,
mode: mode ?? "crawl", // fix for single urls not working
- crawlerOptions: { ...crawlerOptions },
+ crawlerOptions: crawlerOptions,
team_id: team_id,
pageOptions: pageOptions,
origin: req.body.origin ?? "api",

View File

@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
- const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+ const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
const job = await addWebScraperJob({
url: url,

View File

@@ -85,6 +85,7 @@ export async function searchHelper(
onlyMainContent: pageOptions?.onlyMainContent ?? true,
fetchPageContent: pageOptions?.fetchPageContent ?? true,
includeHtml: pageOptions?.includeHtml ?? false,
+ removeTags: pageOptions?.removeTags ?? [],
fallback: false,
},
});
@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
includeHtml: false,
onlyMainContent: true,
fetchPageContent: true,
+ removeTags: [],
fallback: false,
};
const origin = req.body.origin ?? "api";

View File

@@ -5,13 +5,32 @@ import "dotenv/config";
import { getWebScraperQueue } from "./services/queue-service";
import { redisClient } from "./services/rate-limiter";
import { v0Router } from "./routes/v0";
- import { initSDK } from '@hyperdx/node-opentelemetry';
+ import { initSDK } from "@hyperdx/node-opentelemetry";
+ import cluster from "cluster";
+ import os from "os";
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express");
- export const app = express();
+ const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
console.log(`Number of CPUs: ${numCPUs} available`);
if (cluster.isMaster) {
console.log(`Master ${process.pid} is running`);
// Fork workers.
for (let i = 0; i < numCPUs; i++) {
cluster.fork();
}
cluster.on("exit", (worker, code, signal) => {
console.log(`Worker ${worker.process.pid} exited`);
console.log("Starting a new worker");
cluster.fork();
});
} else {
const app = express();
global.isProduction = process.env.IS_PRODUCTION === "true";
@@ -50,14 +69,13 @@ const HOST = process.env.HOST ?? "localhost";
redisClient.connect();
// HyperDX OpenTelemetry
- if(process.env.ENV === 'production') {
+ if (process.env.ENV === "production") {
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
}
- export function startServer(port = DEFAULT_PORT) {
+ function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => {
- console.log(`Server listening on port ${port}`);
+ console.log(`Worker ${process.pid} listening on port ${port}`);
console.log(
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
);
@@ -112,7 +130,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
}
});
- app.get('/serverHealthCheck/notify', async (req, res) => {
+ app.get("/serverHealthCheck/notify", async (req, res) => {
if (process.env.SLACK_WEBHOOK_URL) {
const treshold = 1; // The treshold value for the active jobs
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
@@ -138,19 +156,21 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
if (waitingJobsCount >= treshold) {
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
const message = {
- text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`,
+ text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
+ timeout / 60000
+ } minute(s).`,
};
const response = await fetch(slackWebhookUrl, {
- method: 'POST',
+ method: "POST",
headers: {
- 'Content-Type': 'application/json',
+ "Content-Type": "application/json",
},
body: JSON.stringify(message),
- })
+ });
if (!response.ok) {
- console.error('Failed to send Slack notification')
+ console.error("Failed to send Slack notification");
}
}
}, timeout);
@@ -164,10 +184,36 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
}
});
app.get(
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
async (req, res) => {
try {
const webScraperQueue = getWebScraperQueue();
const completedJobs = await webScraperQueue.getJobs(["completed"]);
const before24hJobs = completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
);
const jobIds = before24hJobs.map((job) => job.id) as string[];
let count = 0;
for (const jobId of jobIds) {
try {
await webScraperQueue.removeJobs(jobId);
count++;
} catch (jobError) {
console.error(`Failed to remove job with ID ${jobId}:`, jobError);
}
}
res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) {
console.error("Failed to clean last 24h complete jobs:", error);
res.status(500).send("Failed to clean jobs");
}
}
);
app.get("/is-production", (req, res) => { app.get("/is-production", (req, res) => {
res.send({ isProduction: global.isProduction }); res.send({ isProduction: global.isProduction });
}); });
console.log(`Worker ${process.pid} started`);
// /workers health check, cant act as load balancer, just has to be a pre deploy thing }

View File

@@ -18,6 +18,8 @@ export type PageOptions = {
waitFor?: number;
screenshot?: boolean;
headers?: Record<string, string>;
+ replaceAllPathsWithAbsolutePaths?: boolean;
+ removeTags?: string | string[];
};
export type ExtractorOptions = {
@@ -35,10 +37,7 @@ export type SearchOptions = {
location?: string;
};
- export type WebScraperOptions = {
- urls: string[];
- mode: "single_urls" | "sitemap" | "crawl";
- crawlerOptions?: {
+ export type CrawlerOptions = {
returnOnlyUrls?: boolean;
includes?: string[];
excludes?: string[];
@@ -47,8 +46,15 @@ export type WebScraperOptions = {
limit?: number;
generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean;
+ ignoreSitemap?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
- };
+ allowBackwardCrawling?: boolean;
+ }
+ export type WebScraperOptions = {
+ urls: string[];
+ mode: "single_urls" | "sitemap" | "crawl";
+ crawlerOptions?: CrawlerOptions;
pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;
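With crawler options split into their own `CrawlerOptions` type, call sites can build a `WebScraperOptions` object from separately typed pieces. A small sketch assuming only the types shown above (the option values and the import path are illustrative):

```ts
import { CrawlerOptions, PageOptions, WebScraperOptions } from "./lib/entities"; // path is illustrative

const crawlerOptions: CrawlerOptions = {
  ignoreSitemap: true,          // new field
  allowBackwardCrawling: false, // new field
  limit: 500,
  mode: "fast",
};

const pageOptions: PageOptions = {
  onlyMainContent: false,
  includeHtml: false,
  removeTags: [".ad", "#footer"],         // new field
  replaceAllPathsWithAbsolutePaths: true, // new field
};

const options: WebScraperOptions = {
  urls: ["https://example.com"],
  mode: "crawl",
  crawlerOptions,
  pageOptions,
};
```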

View File

@@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import async from "async";
- import { Progress } from "../../lib/entities";
+ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser";
@@ -20,6 +20,7 @@ export class WebCrawler {
private robotsTxtUrl: string;
private robots: any;
private generateImgAltText: boolean;
+ private allowBackwardCrawling: boolean;
constructor({
initialUrl,
@@ -29,6 +30,7 @@
limit = 10000,
generateImgAltText = false,
maxCrawledDepth = 10,
+ allowBackwardCrawling = false
}: {
initialUrl: string;
includes?: string[];
@@ -37,6 +39,7 @@
limit?: number;
generateImgAltText?: boolean;
maxCrawledDepth?: number;
+ allowBackwardCrawling?: boolean;
}) {
this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin;
@@ -49,6 +52,7 @@
this.maxCrawledLinks = maxCrawledLinks ?? limit;
this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false;
+ this.allowBackwardCrawling = allowBackwardCrawling ?? false;
}
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@@ -90,10 +94,16 @@
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
- if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
+ if (linkHostname !== initialHostname) {
return false;
}
+ if (!this.allowBackwardCrawling) {
+ if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
+ return false;
+ }
+ }
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt
if (!isAllowed) {
@@ -108,6 +118,8 @@
public async start(
inProgress?: (progress: Progress) => void,
+ pageOptions?: PageOptions,
+ crawlerOptions?: CrawlerOptions,
concurrencyLimit: number = 5,
limit: number = 10000,
maxDepth: number = 10
@@ -122,17 +134,21 @@
}
+ if(!crawlerOptions?.ignoreSitemap){
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks.map(link => ({ url: link, html: "" }));
}
+ }
const urls = await this.crawlUrls(
[this.initialUrl],
+ pageOptions,
concurrencyLimit,
inProgress
);
if (
urls.length === 0 &&
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -140,14 +156,15 @@
return [{ url: this.initialUrl, html: "" }];
}
// make sure to run include exclude here again
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
}
private async crawlUrls(
urls: string[],
+ pageOptions: PageOptions,
concurrencyLimit: number,
inProgress?: (progress: Progress) => void,
): Promise<{ url: string, html: string }[]> {
@@ -158,7 +175,7 @@
}
return;
}
- const newUrls = await this.crawl(task);
+ const newUrls = await this.crawl(task, pageOptions);
// add the initial url if not already added
// if (this.visited.size === 1) {
//   let normalizedInitial = this.initialUrl;
@@ -188,7 +205,7 @@
currentDocumentUrl: task,
});
}
- await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
+ await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
if (callback && typeof callback === "function") {
callback();
}
@@ -207,20 +224,18 @@
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
}
- async crawl(url: string): Promise<{url: string, html: string}[]> {
- if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
+ async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
+ const normalizedUrl = this.normalizeCrawlUrl(url);
+ if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
return [];
}
- this.visited.add(url);
+ this.visited.add(normalizedUrl);
if (!url.startsWith("http")) {
url = "https://" + url;
}
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
@@ -231,8 +246,8 @@
let content: string = "";
// If it is the first link, fetch with single url
if (this.visited.size === 1) {
- const page = await scrapSingleUrl(url, {includeHtml: true});
- content = page.html ?? ""
+ const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+ content = page.html ?? "";
} else {
const response = await axios.get(url);
content = response.data ?? "";
@@ -241,12 +256,10 @@
let links: { url: string, html: string }[] = [];
// Add the initial URL to the list of links
- if(this.visited.size === 1)
- {
+ if (this.visited.size === 1) {
links.push({ url, html: content });
}
$("a").each((_, element) => {
const href = $(element).attr("href");
if (href) {
@@ -254,14 +267,15 @@
if (!href.startsWith("http")) {
fullUrl = new URL(href, this.baseUrl).toString();
}
- const url = new URL(fullUrl);
- const path = url.pathname;
+ const urlObj = new URL(fullUrl);
+ const path = urlObj.pathname;
if (
this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) &&
- this.matchesIncludes(path) &&
+ // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
+ // this.matchesIncludes(path) &&
!this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) {
@@ -274,12 +288,22 @@
return links;
}
// Create a new list to return to avoid modifying the visited list
- return links.filter((link) => !this.visited.has(link.url));
+ return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
} catch (error) {
return [];
}
}
private normalizeCrawlUrl(url: string): string {
try{
const urlObj = new URL(url);
urlObj.searchParams.sort(); // Sort query parameters to normalize
return urlObj.toString();
} catch (error) {
return url;
}
}
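Since `normalizeCrawlUrl` only sorts query parameters, the same page reached with parameters in a different order now maps to one visited-set key. A standalone illustration of the idea (the helper is private, so this re-implements it outside the class):

```ts
// Same normalization trick as the private helper above, shown standalone.
function normalizeCrawlUrl(url: string): string {
  try {
    const urlObj = new URL(url);
    urlObj.searchParams.sort(); // sort query parameters to normalize
    return urlObj.toString();
  } catch (error) {
    return url;
  }
}

console.log(normalizeCrawlUrl("https://example.com/page?b=2&a=1"));
// -> https://example.com/page?a=1&b=2
console.log(normalizeCrawlUrl("https://example.com/page?a=1&b=2"));
// -> https://example.com/page?a=1&b=2  (same key, so the page is only crawled once)
```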
private matchesIncludes(url: string): boolean {
if (this.includes.length === 0 || this.includes[0] == "") return true;
return this.includes.some((pattern) => new RegExp(pattern).test(url));
@@ -388,7 +412,6 @@
// Normalize and check if the URL is present in any of the sitemaps
const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl

View File

@@ -31,12 +31,14 @@ export class WebScraperDataProvider {
private limit: number = 10000;
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
+ private ignoreSitemap: boolean = false;
private pageOptions?: PageOptions;
private extractorOptions?: ExtractorOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo";
private crawlerMode: string = "default";
+ private allowBackwardCrawling: boolean = false;
authorize(): void {
throw new Error("Method not implemented.");
@@ -169,10 +171,15 @@
maxCrawledDepth: this.maxCrawledDepth,
limit: this.limit,
generateImgAltText: this.generateImgAltText,
+ allowBackwardCrawling: this.allowBackwardCrawling,
});
let links = await crawler.start(
inProgress,
+ this.pageOptions,
+ {
+ ignoreSitemap: this.ignoreSitemap,
+ },
5,
this.limit,
this.maxCrawledDepth
@@ -296,9 +303,10 @@
}
private applyPathReplacements(documents: Document[]): Document[] {
- return this.replaceAllPathsWithAbsolutePaths
- ? replacePathsWithAbsolutePaths(documents)
- : replaceImgPathsWithAbsolutePaths(documents);
+ if (this.replaceAllPathsWithAbsolutePaths) {
+ documents = replacePathsWithAbsolutePaths(documents);
+ }
+ return replaceImgPathsWithAbsolutePaths(documents);
}
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
@@ -467,12 +475,19 @@
this.limit = options.crawlerOptions?.limit ?? 10000;
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
- this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+ this.pageOptions = options.pageOptions ?? {
+ onlyMainContent: false,
+ includeHtml: false,
+ replaceAllPathsWithAbsolutePaths: false,
+ removeTags: []
+ };
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
- this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
+ this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
+ this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
+ this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
// make sure all urls start with https://
this.urls = this.urls.map((url) => {

View File

@@ -8,6 +8,7 @@ import { excludeNonMainTags } from "./utils/excludeTags";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
+ import axios from "axios";
dotenv.config();
@@ -19,6 +20,8 @@ const baseScrapers = [
"fetch",
] as const;
+ const universalTimeout = 15000;
export async function generateRequestParams(
url: string,
wait_browser: string = "domcontentloaded",
@@ -59,21 +62,24 @@ export async function scrapWithFireEngine(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
);
- const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
- method: "POST",
- headers: {
- "Content-Type": "application/json",
- },
- body: JSON.stringify({
+ const response = await axios.post(
+ process.env.FIRE_ENGINE_BETA_URL + "/scrape",
+ {
url: url,
wait: waitParam,
screenshot: screenshotParam,
headers: headers,
- pageOptions: pageOptions
- }),
- });
+ pageOptions: pageOptions,
+ },
+ {
+ headers: {
+ "Content-Type": "application/json",
+ },
+ timeout: universalTimeout + waitParam
+ }
+ );
- if (!response.ok) {
+ if (response.status !== 200) {
console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
);
@@ -84,13 +90,17 @@
if (contentType && contentType.includes("application/pdf")) {
return { html: await fetchAndProcessPdf(url), screenshot: "" };
} else {
- const data = await response.json();
+ const data = response.data;
const html = data.content;
const screenshot = data.screenshot;
return { html: html ?? "", screenshot: screenshot ?? "" };
}
} catch (error) {
+ if (error.code === 'ECONNABORTED') {
+ console.log(`[Fire-Engine] Request timed out for ${url}`);
+ } else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
+ }
return { html: "", screenshot: "" };
}
}
@@ -98,7 +108,7 @@
export async function scrapWithScrapingBee(
url: string,
wait_browser: string = "domcontentloaded",
- timeout: number = 15000
+ timeout: number = universalTimeout
): Promise<string> {
try {
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
@@ -141,15 +151,19 @@
// If the user has passed a wait parameter in the request, use that
const waitParam = reqParams["params"]?.wait ?? waitFor;
- const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
- method: "POST",
+ const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
+ url: url,
+ wait_after_load: waitParam,
+ headers: headers,
+ }, {
headers: {
"Content-Type": "application/json",
},
- body: JSON.stringify({ url: url, wait_after_load: waitParam, headers: headers }),
+ timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
+ transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
});
- if (!response.ok) {
+ if (response.status !== 200) {
console.error(
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
);
@@ -160,7 +174,7 @@
if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url);
} else {
- const textData = await response.text();
+ const textData = response.data;
try {
const data = JSON.parse(textData);
const html = data.content;
@@ -171,17 +185,28 @@
}
}
} catch (error) {
+ if (error.code === 'ECONNABORTED') {
+ console.log(`[Playwright] Request timed out for ${url}`);
+ } else {
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
+ }
return "";
}
}
export async function scrapWithFetch(url: string): Promise<string> {
try {
- const response = await fetch(url);
- if (!response.ok) {
+ const response = await axios.get(url, {
+ headers: {
+ "Content-Type": "application/json",
+ },
+ timeout: universalTimeout,
+ transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
+ });
+ if (response.status !== 200) {
console.error(
- `[Fetch] Error fetching url: ${url} with status: ${response.status}`
+ `[Axios] Error fetching url: ${url} with status: ${response.status}`
);
return "";
}
@@ -190,11 +215,15 @@
if (contentType && contentType.includes("application/pdf")) {
return fetchAndProcessPdf(url);
} else {
- const text = await response.text();
+ const text = response.data;
return text;
}
} catch (error) {
- console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
+ if (error.code === 'ECONNABORTED') {
+ console.log(`[Axios] Request timed out for ${url}`);
+ } else {
+ console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
+ }
return "";
}
}
@@ -275,6 +304,19 @@ export async function scrapSingleUrl(
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
const soup = cheerio.load(html);
soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') {
pageOptions.removeTags.split(',').forEach((tag) => {
soup(tag.trim()).remove();
});
} else if (Array.isArray(pageOptions.removeTags)) {
pageOptions.removeTags.forEach((tag) => {
soup(tag).remove();
});
}
}
if (pageOptions.onlyMainContent) {
// remove any other tags that are not in the main content
excludeNonMainTags.forEach((tag) => {
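`removeTags` accepts either a comma-separated string or an array, and every entry is handed to cheerio as a selector, so plain tag names, classes, and ids all work. A self-contained sketch of that behavior on a toy document (not the actual helper, just the cheerio calls it relies on):

```ts
import cheerio from "cheerio";

// Mirrors the removeTags handling above on a small HTML snippet.
const html = `<main><p>keep me</p><div class="ad">ad</div><footer id="footer">footer</footer></main>`;
const soup = cheerio.load(html);

const removeTags: string | string[] = ".ad, #footer"; // same behavior with ['.ad', '#footer']
if (typeof removeTags === "string") {
  removeTags.split(",").forEach((tag) => soup(tag.trim()).remove());
} else {
  removeTags.forEach((tag) => soup(tag).remove());
}

console.log(soup.html()); // only <main><p>keep me</p></main> remains (cheerio adds html/body wrappers)
```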

View File

@@ -12,6 +12,7 @@ export async function getLinksFromSitemap(
content = response.data;
} catch (error) {
console.error(`Request failed for ${sitemapUrl}: ${error}`);
return allUrls;
}

View File

@@ -6,12 +6,14 @@ describe('replacePaths', () => {
it('should replace relative paths with absolute paths', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).'
+ content: 'This is a [link](/path/to/resource).',
+ markdown: 'This is a [link](/path/to/resource).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).'
+ content: 'This is a [link](https://example.com/path/to/resource).',
+ markdown: 'This is a [link](https://example.com/path/to/resource).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@@ -21,7 +23,8 @@
it('should not alter absolute URLs', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).'
+ content: 'This is an [external link](https://external.com/path).',
+ markdown: 'This is an [external link](https://external.com/path).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@@ -31,7 +34,8 @@
it('should not alter data URLs for images', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
+ content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).',
+ markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@@ -41,12 +45,14 @@
it('should handle multiple links and images correctly', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).'
+ content: 'Here are two links: [link1](/path1) and [link2](/path2).',
+ markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).'
+ content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
+ markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@@ -56,12 +62,14 @@
it('should correctly handle a mix of absolute and relative paths', () => {
const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
+ content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
+ markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
}];
const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' },
- content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
+ content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
+ markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
}];
const result = replacePathsWithAbsolutePaths(documents);
@ -74,12 +82,14 @@ describe('replacePaths', () => {
it('should replace relative image paths with absolute paths', () => { it('should replace relative image paths with absolute paths', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Here is an image: ![alt text](/path/to/image.jpg).' content: 'Here is an image: ![alt text](/path/to/image.jpg).',
markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
}]; }];
const expectedDocuments: Document[] = [{ const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
}]; }];
const result = replaceImgPathsWithAbsolutePaths(documents); const result = replaceImgPathsWithAbsolutePaths(documents);
@ -89,7 +99,8 @@ describe('replacePaths', () => {
it('should not alter data:image URLs', () => { it('should not alter data:image URLs', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).' content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).',
markdown: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
}]; }];
const result = replaceImgPathsWithAbsolutePaths(documents); const result = replaceImgPathsWithAbsolutePaths(documents);
@ -99,12 +110,14 @@ describe('replacePaths', () => {
it('should handle multiple images with a mix of data and relative URLs', () => { it('should handle multiple images with a mix of data and relative URLs', () => {
const documents: Document[] = [{ const documents: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).',
markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
}]; }];
const expectedDocuments: Document[] = [{ const expectedDocuments: Document[] = [{
metadata: { sourceURL: 'https://example.com' }, metadata: { sourceURL: 'https://example.com' },
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).',
markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
}]; }];
const result = replaceImgPathsWithAbsolutePaths(documents); const result = replaceImgPathsWithAbsolutePaths(documents);

View File

@ -10,6 +10,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
) || []; ) || [];
paths.forEach((path: string) => { paths.forEach((path: string) => {
try {
const isImage = path.startsWith("!"); const isImage = path.startsWith("!");
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
let url = matchedUrl[1]; let url = matchedUrl[1];
@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
} }
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
if (isImage) { // Image is handled afterwards
document.content = document.content.replace( if (!isImage) {
path,
`${markdownLinkOrImageText}(${url})`
);
} else {
document.content = document.content.replace( document.content = document.content.replace(
path, path,
`${markdownLinkOrImageText}(${url})` `${markdownLinkOrImageText}(${url})`
); );
}
} catch (error) {
} }
}); });
document.markdown = document.content;
}); });
return documents; return documents;
@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
if (!imageUrl.startsWith("http")) { if (!imageUrl.startsWith("http")) {
if (imageUrl.startsWith("/")) { if (imageUrl.startsWith("/")) {
imageUrl = imageUrl.substring(1); imageUrl = imageUrl.substring(1);
}
imageUrl = new URL(imageUrl, baseUrl).toString(); imageUrl = new URL(imageUrl, baseUrl).toString();
} else {
imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
}
} }
} }
@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
`![${altText}](${imageUrl})` `![${altText}](${imageUrl})`
); );
}); });
document.markdown = document.content;
}); });
return documents; return documents;
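Taken together, the changes above mean both helpers rewrite relative links and images against the page's sourceURL and then mirror the rewritten content into document.markdown. A minimal usage sketch follows; the import path and the exact Document shape are assumptions for illustration, not taken from the repository.

// Sketch only: module path and Document shape are assumed for illustration.
import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "./replacePaths";

const docs = [{
  metadata: { sourceURL: "https://example.com" },
  content: "See [docs](/docs) and ![logo](/logo.png).",
  markdown: "See [docs](/docs) and ![logo](/logo.png).",
}];

// Links are rewritten first (images are deliberately skipped there), then images;
// afterwards content and markdown both read:
// "See [docs](https://example.com/docs) and ![logo](https://example.com/logo.png)."
const withLinks = replacePathsWithAbsolutePaths(docs);
const absolute = replaceImgPathsWithAbsolutePaths(withLinks);
console.log(absolute[0].markdown);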

View File

@ -38,7 +38,7 @@ getWebScraperQueue().process(
error: message /* etc... */, error: message /* etc... */,
}; };
await callWebhook(job.data.team_id, data); await callWebhook(job.data.team_id, job.id as string, data);
await logJob({ await logJob({
success: success, success: success,
@ -78,7 +78,7 @@ getWebScraperQueue().process(
error: error:
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */, "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
}; };
await callWebhook(job.data.team_id, data); await callWebhook(job.data.team_id, job.id as string, data);
await logJob({ await logJob({
success: false, success: false,
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"), message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),

View File

@ -1,8 +1,35 @@
import Redis from 'ioredis'; import Redis from "ioredis";
// Initialize Redis client // Initialize Redis client
const redis = new Redis(process.env.REDIS_URL); const redis = new Redis(process.env.REDIS_URL);
// Listen for 'error' events on the Redis connection
redis.on("error", (error) => {
try {
if (error.message === "ECONNRESET") {
console.log("Connection to Redis Session Store timed out.");
} else if (error.message === "ECONNREFUSED") {
console.log("Connection to Redis Session Store refused!");
} else console.log(error);
} catch (error) {}
});
// Listen for the 'reconnecting' event on the Redis connection
redis.on("reconnecting", (err) => {
try {
if (redis.status === "reconnecting")
console.log("Reconnecting to Redis Session Store...");
else console.log("Error reconnecting to Redis Session Store.");
} catch (error) {}
});
// Listen for the 'connect' event on the Redis connection
redis.on("connect", (err) => {
try {
if (!err) console.log("Connected to Redis Session Store!");
} catch (error) {}
});
/** /**
* Set a value in Redis with an optional expiration time. * Set a value in Redis with an optional expiration time.
* @param {string} key The key under which to store the value. * @param {string} key The key under which to store the value.
@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL);
*/ */
const setValue = async (key: string, value: string, expire?: number) => { const setValue = async (key: string, value: string, expire?: number) => {
if (expire) { if (expire) {
await redis.set(key, value, 'EX', expire); await redis.set(key, value, "EX", expire);
} else { } else {
await redis.set(key, value); await redis.set(key, value);
} }
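For reference, a small usage sketch of the expiring write path; it assumes this helper module exports setValue (the export and the import path are assumptions for illustration).

// Sketch only: assumes setValue is exported from this Redis helper module.
import { setValue } from "./redis";

async function cacheJobStatus(jobId: string): Promise<void> {
  // The "EX" argument above gives the key a TTL in seconds, so stale entries
  // expire on their own instead of requiring a cleanup job.
  await setValue(`job:${jobId}:status`, "completed", 3600);
}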

View File

@ -1,17 +1,19 @@
import { supabase_service } from "./supabase"; import { supabase_service } from "./supabase";
export const callWebhook = async (teamId: string, data: any) => { export const callWebhook = async (teamId: string, jobId: string, data: any) => {
try { try {
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL; const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
let webhookUrl = selfHostedUrl; let webhookUrl = selfHostedUrl;
if (!selfHostedUrl) { // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set
// and the USE_DB_AUTHENTICATION environment variable is set to true
if (!selfHostedUrl && useDbAuthentication) {
const { data: webhooksData, error } = await supabase_service const { data: webhooksData, error } = await supabase_service
.from("webhooks") .from("webhooks")
.select("url") .select("url")
.eq("team_id", teamId) .eq("team_id", teamId)
.limit(1); .limit(1);
if (error) { if (error) {
console.error( console.error(
`Error fetching webhook URL for team ID: ${teamId}`, `Error fetching webhook URL for team ID: ${teamId}`,
@ -45,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => {
}, },
body: JSON.stringify({ body: JSON.stringify({
success: data.success, success: data.success,
jobId: jobId,
data: dataToSend, data: dataToSend,
error: data.error || undefined, error: data.error || undefined,
}), }),
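With jobId now included in the payload, a self-hosted receiver can correlate webhook calls with crawl jobs. A sketch of such a receiver is below; Express and the route path are assumptions, while the field names (success, jobId, data, error) follow the body built above.

// Sketch only: Express and the route path are assumed; field names match the payload above.
import express from "express";

const app = express();
app.use(express.json());

app.post("/firecrawl-webhook", (req, res) => {
  const { success, jobId, data, error } = req.body;
  if (success) {
    console.log(`Crawl job ${jobId} completed with ${Array.isArray(data) ? data.length : 0} documents`);
  } else {
    console.error(`Crawl job ${jobId} failed: ${error}`);
  }
  // Acknowledge quickly so the sender does not treat the hook as failed.
  res.sendStatus(200);
});

app.listen(3000);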

View File

@ -1,3 +1,57 @@
"""
This is the Firecrawl package.
This package provides a Python SDK for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs.
For more information visit https://github.com/firecrawl/
"""
import logging
import os
from .firecrawl import FirecrawlApp from .firecrawl import FirecrawlApp
__version__ = "0.0.14" __version__ = "0.0.15"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
def _basic_config() -> None:
"""Set up basic configuration for logging with a specific format and date format."""
try:
logging.basicConfig(
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
except Exception as e:
logger.error("Failed to configure logging: %s", e)
def setup_logging() -> None:
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
env = os.environ.get(
"FIRECRAWL_LOGGING_LEVEL", "INFO"
).upper() # Default to 'INFO' level
_basic_config()
if env == "DEBUG":
logger.setLevel(logging.DEBUG)
elif env == "INFO":
logger.setLevel(logging.INFO)
elif env == "WARNING":
logger.setLevel(logging.WARNING)
elif env == "ERROR":
logger.setLevel(logging.ERROR)
elif env == "CRITICAL":
logger.setLevel(logging.CRITICAL)
else:
logger.setLevel(logging.INFO)
logger.warning("Unknown logging level: %s, defaulting to INFO", env)
# Initialize logging configuration when the module is imported
setup_logging()
logger.debug("Debugging logger setup")

View File

@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
Classes: Classes:
- FirecrawlApp: Main class for interacting with the Firecrawl API. - FirecrawlApp: Main class for interacting with the Firecrawl API.
""" """
import logging
import os import os
import time import time
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import requests import requests
logger : logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp: class FirecrawlApp:
""" """
@ -28,8 +29,14 @@ class FirecrawlApp:
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
if self.api_key is None: if self.api_key is None:
logger.warning("No API key provided")
raise ValueError('No API key provided') raise ValueError('No API key provided')
else:
logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
if self.api_url != 'https://api.firecrawl.dev':
logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
""" """

View File

Binary image assets (11 files); before/after sizes unchanged: 7.8 KiB, 23 KiB, 7.0 KiB, 444 KiB, 492 B, 997 B, 15 KiB, 1.3 KiB, 262 KiB, 629 B, 15 KiB

View File

@ -1,4 +1,4 @@
import OpenAI from "openai"; import OpenAI from "openai/index.mjs";
import { encoding_for_model } from "@dqbd/tiktoken"; import { encoding_for_model } from "@dqbd/tiktoken";
/** /**