Merge branch 'main' into py-sdk-improve-response-handling
20
.github/workflows/clean-before-24h-complete-jobs.yml
vendored
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
name: Clean Before 24h Completed Jobs
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 0 * * *'
|
||||||
|
|
||||||
|
env:
|
||||||
|
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
clean-jobs:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Send GET request to clean jobs
|
||||||
|
run: |
|
||||||
|
response=$(curl --write-out '%{http_code}' --silent --output /dev/null https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs)
|
||||||
|
if [ "$response" -ne 200 ]; then
|
||||||
|
echo "Failed to clean jobs. Response: $response"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Successfully cleaned jobs. Response: $response"
|
4
.github/workflows/fly.yml
vendored
|
@ -183,6 +183,7 @@ jobs:
|
||||||
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
|
||||||
|
|
||||||
build-and-publish-python-sdk:
|
build-and-publish-python-sdk:
|
||||||
|
name: Build and publish Python SDK
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: deploy
|
needs: deploy
|
||||||
|
|
||||||
|
@ -213,7 +214,7 @@ jobs:
|
||||||
working-directory: ./apps/python-sdk
|
working-directory: ./apps/python-sdk
|
||||||
|
|
||||||
- name: Publish to PyPI
|
- name: Publish to PyPI
|
||||||
if: ${{ env.VERSION_INCREMENTED == 'true' }}
|
if: ${{ env.PYTHON_SDK_VERSION_INCREMENTED == 'true' }}
|
||||||
env:
|
env:
|
||||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||||
|
@ -222,6 +223,7 @@ jobs:
|
||||||
working-directory: ./apps/python-sdk
|
working-directory: ./apps/python-sdk
|
||||||
|
|
||||||
build-and-publish-js-sdk:
|
build-and-publish-js-sdk:
|
||||||
|
name: Build and publish JavaScript SDK
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: deploy
|
needs: deploy
|
||||||
|
|
||||||
|
|
|
@ -54,7 +54,7 @@ kill_timeout = '5s'
|
||||||
soft_limit = 12
|
soft_limit = 12
|
||||||
|
|
||||||
[[vm]]
|
[[vm]]
|
||||||
size = 'performance-8x'
|
size = 'performance-4x'
|
||||||
processes = ['app']
|
processes = ['app']
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -51,10 +51,26 @@
|
||||||
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
|
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
|
||||||
"default": false
|
"default": false
|
||||||
},
|
},
|
||||||
|
"screenshot": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
"waitFor": {
|
"waitFor": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||||
"default": 0
|
"default": 0
|
||||||
|
},
|
||||||
|
"removeTags": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||||
|
},
|
||||||
|
"headers": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -176,10 +192,20 @@
|
||||||
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
|
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
|
||||||
"default": "default"
|
"default": "default"
|
||||||
},
|
},
|
||||||
|
"ignoreSitemap": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Ignore the website sitemap when crawling",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
"limit": {
|
"limit": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Maximum number of pages to crawl",
|
"description": "Maximum number of pages to crawl",
|
||||||
"default": 10000
|
"default": 10000
|
||||||
|
},
|
||||||
|
"allowBackwardCrawling": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
|
||||||
|
"default": false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -195,6 +221,27 @@
|
||||||
"type": "boolean",
|
"type": "boolean",
|
||||||
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
|
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
|
||||||
"default": false
|
"default": false
|
||||||
|
},
|
||||||
|
"screenshot": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
|
"headers": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
|
||||||
|
},
|
||||||
|
"removeTags": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||||
|
},
|
||||||
|
"replaceAllPathsWithAbsolutePaths": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Replace all relative paths with absolute paths for images and links",
|
||||||
|
"default": false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -368,7 +415,7 @@
|
||||||
"items": {
|
"items": {
|
||||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||||
},
|
},
|
||||||
"description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled."
|
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -513,6 +560,10 @@
|
||||||
"nullable": true,
|
"nullable": true,
|
||||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||||
},
|
},
|
||||||
|
"index": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||||
|
},
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import request from "supertest";
|
import request from "supertest";
|
||||||
import { app } from "../../index";
|
|
||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
const fs = require("fs");
|
const fs = require("fs");
|
||||||
const path = require("path");
|
const path = require("path");
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import request from "supertest";
|
import request from "supertest";
|
||||||
import { app } from "../../index";
|
|
||||||
import dotenv from "dotenv";
|
import dotenv from "dotenv";
|
||||||
import { v4 as uuidv4 } from "uuid";
|
import { v4 as uuidv4 } from "uuid";
|
||||||
|
|
||||||
|
@ -35,7 +34,7 @@ describe("E2E Tests for API Routes", () => {
|
||||||
|
|
||||||
describe("POST /v0/scrape", () => {
|
describe("POST /v0/scrape", () => {
|
||||||
it.concurrent("should require authorization", async () => {
|
it.concurrent("should require authorization", async () => {
|
||||||
const response = await request(app).post("/v0/scrape");
|
const response = await request(TEST_URL).post("/v0/scrape");
|
||||||
expect(response.statusCode).toBe(401);
|
expect(response.statusCode).toBe(401);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -136,6 +135,40 @@ describe("E2E Tests for API Routes", () => {
|
||||||
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||||
}, 60000); // 60 seconds
|
}, 60000); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||||
|
const responseWithoutRemoveTags = await request(TEST_URL)
|
||||||
|
.post("/v0/scrape")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: "https://www.scrapethissite.com/" });
|
||||||
|
expect(responseWithoutRemoveTags.statusCode).toBe(200);
|
||||||
|
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
|
||||||
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
|
||||||
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
||||||
|
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||||
|
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||||
|
expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
|
||||||
|
expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
|
||||||
|
expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
|
||||||
|
expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
|
||||||
|
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.post("/v0/scrape")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty("data");
|
||||||
|
expect(response.body.data).toHaveProperty("content");
|
||||||
|
expect(response.body.data).toHaveProperty("markdown");
|
||||||
|
expect(response.body.data).toHaveProperty("metadata");
|
||||||
|
expect(response.body.data).not.toHaveProperty("html");
|
||||||
|
expect(response.body.data.content).toContain("Scrape This Site");
|
||||||
|
expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
|
||||||
|
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
|
||||||
|
expect(response.body.data.content).not.toContain("web scraping"); // strong
|
||||||
|
}, 30000); // 30 seconds timeout
|
||||||
|
|
||||||
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
||||||
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
|
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
|
||||||
// const startTime = Date.now();
|
// const startTime = Date.now();
|
||||||
|
@ -596,7 +629,7 @@ describe("E2E Tests for API Routes", () => {
|
||||||
.post("/v0/crawl")
|
.post("/v0/crawl")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://roastmywebsite.ai" });
|
.send({ url: "https://mendable.ai/blog" });
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
let isCompleted = false;
|
let isCompleted = false;
|
||||||
|
@ -622,7 +655,13 @@ describe("E2E Tests for API Routes", () => {
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
expect(completedResponse.body.data[0].content).toContain("_Roast_");
|
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||||
|
|
||||||
|
const childrenLinks = completedResponse.body.data.filter(doc =>
|
||||||
|
doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||||
}, 120000); // 120 seconds
|
}, 120000); // 120 seconds
|
||||||
|
|
||||||
it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
|
it.concurrent('should return a successful response for a valid crawl job with PDF files without explicit .pdf extension', async () => {
|
||||||
|
@ -757,34 +796,82 @@ describe("E2E Tests for API Routes", () => {
|
||||||
}, 60000);
|
}, 60000);
|
||||||
}); // 60 seconds
|
}); // 60 seconds
|
||||||
|
|
||||||
|
it.concurrent("should return a successful response for a valid crawl job with allowBackwardCrawling set to true option", async () => {
|
||||||
|
const crawlResponse = await request(TEST_URL)
|
||||||
|
.post("/v0/crawl")
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
|
.set("Content-Type", "application/json")
|
||||||
|
.send({
|
||||||
|
url: "https://mendable.ai/blog",
|
||||||
|
pageOptions: { includeHtml: true },
|
||||||
|
crawlerOptions: { allowBackwardCrawling: true },
|
||||||
|
});
|
||||||
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
|
let isFinished = false;
|
||||||
|
let completedResponse;
|
||||||
|
|
||||||
|
while (!isFinished) {
|
||||||
|
const response = await request(TEST_URL)
|
||||||
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
expect(response.statusCode).toBe(200);
|
||||||
|
expect(response.body).toHaveProperty("status");
|
||||||
|
|
||||||
|
if (response.body.status === "completed") {
|
||||||
|
isFinished = true;
|
||||||
|
completedResponse = response;
|
||||||
|
} else {
|
||||||
|
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(completedResponse.statusCode).toBe(200);
|
||||||
|
expect(completedResponse.body).toHaveProperty("status");
|
||||||
|
expect(completedResponse.body.status).toBe("completed");
|
||||||
|
expect(completedResponse.body).toHaveProperty("data");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("content");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||||
|
expect(completedResponse.body.data[0]).toHaveProperty("html");
|
||||||
|
expect(completedResponse.body.data[0].content).toContain("Mendable");
|
||||||
|
expect(completedResponse.body.data[0].markdown).toContain("Mendable");
|
||||||
|
|
||||||
|
const onlyChildrenLinks = completedResponse.body.data.filter(doc => {
|
||||||
|
return doc.metadata && doc.metadata.sourceURL && doc.metadata.sourceURL.includes("mendable.ai/blog")
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(completedResponse.body.data.length).toBeGreaterThan(onlyChildrenLinks.length);
|
||||||
|
}, 60000);
|
||||||
|
|
||||||
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
|
it.concurrent("If someone cancels a crawl job, it should turn into failed status", async () => {
|
||||||
const crawlResponse = await request(TEST_URL)
|
const crawlResponse = await request(TEST_URL)
|
||||||
.post("/v0/crawl")
|
.post("/v0/crawl")
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||||
.set("Content-Type", "application/json")
|
.set("Content-Type", "application/json")
|
||||||
.send({ url: "https://jestjs.io" });
|
.send({ url: "https://jestjs.io" });
|
||||||
|
|
||||||
expect(crawlResponse.statusCode).toBe(200);
|
expect(crawlResponse.statusCode).toBe(200);
|
||||||
|
|
||||||
// wait for 30 seconds
|
|
||||||
await new Promise((r) => setTimeout(r, 20000));
|
await new Promise((r) => setTimeout(r, 20000));
|
||||||
|
|
||||||
const response = await request(TEST_URL)
|
const responseCancel = await request(TEST_URL)
|
||||||
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
|
.delete(`/v0/crawl/cancel/${crawlResponse.body.jobId}`)
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
expect(response.statusCode).toBe(200);
|
expect(responseCancel.statusCode).toBe(200);
|
||||||
expect(response.body).toHaveProperty("status");
|
expect(responseCancel.body).toHaveProperty("status");
|
||||||
expect(response.body.status).toBe("cancelled");
|
expect(responseCancel.body.status).toBe("cancelled");
|
||||||
|
|
||||||
await new Promise((r) => setTimeout(r, 10000));
|
await new Promise((r) => setTimeout(r, 10000));
|
||||||
|
|
||||||
const completedResponse = await request(TEST_URL)
|
const completedResponse = await request(TEST_URL)
|
||||||
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
|
||||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||||
|
|
||||||
expect(completedResponse.statusCode).toBe(200);
|
expect(completedResponse.statusCode).toBe(200);
|
||||||
expect(completedResponse.body).toHaveProperty("status");
|
expect(completedResponse.body).toHaveProperty("status");
|
||||||
expect(completedResponse.body.status).toBe("failed");
|
expect(completedResponse.body.status).toBe("failed");
|
||||||
expect(completedResponse.body).toHaveProperty("data");
|
expect(completedResponse.body).toHaveProperty("data");
|
||||||
expect(completedResponse.body.data).toEqual(null);
|
expect(completedResponse.body.data).toBeNull();
|
||||||
expect(completedResponse.body).toHaveProperty("partial_data");
|
expect(completedResponse.body).toHaveProperty("partial_data");
|
||||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
|
expect(completedResponse.body.partial_data[0]).toHaveProperty("content");
|
||||||
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
expect(completedResponse.body.partial_data[0]).toHaveProperty("markdown");
|
||||||
|
|
|
@ -143,7 +143,7 @@ export async function supaAuthenticateUser(
|
||||||
const startDate = new Date();
|
const startDate = new Date();
|
||||||
const endDate = new Date();
|
const endDate = new Date();
|
||||||
endDate.setDate(endDate.getDate() + 7);
|
endDate.setDate(endDate.getDate() + 7);
|
||||||
await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
|
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||||
|
|
|
@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
allowBackwardCrawling: false
|
||||||
|
};
|
||||||
|
const pageOptions = req.body.pageOptions ?? {
|
||||||
|
onlyMainContent: false,
|
||||||
|
includeHtml: false,
|
||||||
|
removeTags: []
|
||||||
|
};
|
||||||
|
|
||||||
if (mode === "single_urls" && !url.includes(",")) {
|
if (mode === "single_urls" && !url.includes(",")) {
|
||||||
try {
|
try {
|
||||||
|
@ -64,9 +70,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
await a.setOptions({
|
await a.setOptions({
|
||||||
mode: "single_urls",
|
mode: "single_urls",
|
||||||
urls: [url],
|
urls: [url],
|
||||||
crawlerOptions: {
|
crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||||
returnOnlyUrls: true,
|
|
||||||
},
|
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -91,7 +95,7 @@ export async function crawlController(req: Request, res: Response) {
|
||||||
const job = await addWebScraperJob({
|
const job = await addWebScraperJob({
|
||||||
url: url,
|
url: url,
|
||||||
mode: mode ?? "crawl", // fix for single urls not working
|
mode: mode ?? "crawl", // fix for single urls not working
|
||||||
crawlerOptions: { ...crawlerOptions },
|
crawlerOptions: crawlerOptions,
|
||||||
team_id: team_id,
|
team_id: team_id,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
origin: req.body.origin ?? "api",
|
origin: req.body.origin ?? "api",
|
||||||
|
|
|
@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
||||||
|
|
||||||
const mode = req.body.mode ?? "crawl";
|
const mode = req.body.mode ?? "crawl";
|
||||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
||||||
|
|
||||||
const job = await addWebScraperJob({
|
const job = await addWebScraperJob({
|
||||||
url: url,
|
url: url,
|
||||||
|
|
|
@ -85,6 +85,7 @@ export async function searchHelper(
|
||||||
onlyMainContent: pageOptions?.onlyMainContent ?? true,
|
onlyMainContent: pageOptions?.onlyMainContent ?? true,
|
||||||
fetchPageContent: pageOptions?.fetchPageContent ?? true,
|
fetchPageContent: pageOptions?.fetchPageContent ?? true,
|
||||||
includeHtml: pageOptions?.includeHtml ?? false,
|
includeHtml: pageOptions?.includeHtml ?? false,
|
||||||
|
removeTags: pageOptions?.removeTags ?? [],
|
||||||
fallback: false,
|
fallback: false,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
|
||||||
includeHtml: false,
|
includeHtml: false,
|
||||||
onlyMainContent: true,
|
onlyMainContent: true,
|
||||||
fetchPageContent: true,
|
fetchPageContent: true,
|
||||||
|
removeTags: [],
|
||||||
fallback: false,
|
fallback: false,
|
||||||
};
|
};
|
||||||
const origin = req.body.origin ?? "api";
|
const origin = req.body.origin ?? "api";
|
||||||
|
|
|
@ -5,13 +5,32 @@ import "dotenv/config";
|
||||||
import { getWebScraperQueue } from "./services/queue-service";
|
import { getWebScraperQueue } from "./services/queue-service";
|
||||||
import { redisClient } from "./services/rate-limiter";
|
import { redisClient } from "./services/rate-limiter";
|
||||||
import { v0Router } from "./routes/v0";
|
import { v0Router } from "./routes/v0";
|
||||||
import { initSDK } from '@hyperdx/node-opentelemetry';
|
import { initSDK } from "@hyperdx/node-opentelemetry";
|
||||||
|
import cluster from "cluster";
|
||||||
|
import os from "os";
|
||||||
|
|
||||||
const { createBullBoard } = require("@bull-board/api");
|
const { createBullBoard } = require("@bull-board/api");
|
||||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||||
const { ExpressAdapter } = require("@bull-board/express");
|
const { ExpressAdapter } = require("@bull-board/express");
|
||||||
|
|
||||||
export const app = express();
|
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||||
|
console.log(`Number of CPUs: ${numCPUs} available`);
|
||||||
|
|
||||||
|
if (cluster.isMaster) {
|
||||||
|
console.log(`Master ${process.pid} is running`);
|
||||||
|
|
||||||
|
// Fork workers.
|
||||||
|
for (let i = 0; i < numCPUs; i++) {
|
||||||
|
cluster.fork();
|
||||||
|
}
|
||||||
|
|
||||||
|
cluster.on("exit", (worker, code, signal) => {
|
||||||
|
console.log(`Worker ${worker.process.pid} exited`);
|
||||||
|
console.log("Starting a new worker");
|
||||||
|
cluster.fork();
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
const app = express();
|
||||||
|
|
||||||
global.isProduction = process.env.IS_PRODUCTION === "true";
|
global.isProduction = process.env.IS_PRODUCTION === "true";
|
||||||
|
|
||||||
|
@ -50,14 +69,13 @@ const HOST = process.env.HOST ?? "localhost";
|
||||||
redisClient.connect();
|
redisClient.connect();
|
||||||
|
|
||||||
// HyperDX OpenTelemetry
|
// HyperDX OpenTelemetry
|
||||||
if(process.env.ENV === 'production') {
|
if (process.env.ENV === "production") {
|
||||||
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
|
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function startServer(port = DEFAULT_PORT) {
|
||||||
export function startServer(port = DEFAULT_PORT) {
|
|
||||||
const server = app.listen(Number(port), HOST, () => {
|
const server = app.listen(Number(port), HOST, () => {
|
||||||
console.log(`Server listening on port ${port}`);
|
console.log(`Worker ${process.pid} listening on port ${port}`);
|
||||||
console.log(
|
console.log(
|
||||||
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
`For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
|
||||||
);
|
);
|
||||||
|
@ -112,7 +130,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
app.get('/serverHealthCheck/notify', async (req, res) => {
|
app.get("/serverHealthCheck/notify", async (req, res) => {
|
||||||
if (process.env.SLACK_WEBHOOK_URL) {
|
if (process.env.SLACK_WEBHOOK_URL) {
|
||||||
const treshold = 1; // The treshold value for the active jobs
|
const treshold = 1; // The treshold value for the active jobs
|
||||||
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
|
const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds
|
||||||
|
@ -138,19 +156,21 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
|
||||||
if (waitingJobsCount >= treshold) {
|
if (waitingJobsCount >= treshold) {
|
||||||
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
|
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL;
|
||||||
const message = {
|
const message = {
|
||||||
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`,
|
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
|
||||||
|
timeout / 60000
|
||||||
|
} minute(s).`,
|
||||||
};
|
};
|
||||||
|
|
||||||
const response = await fetch(slackWebhookUrl, {
|
const response = await fetch(slackWebhookUrl, {
|
||||||
method: 'POST',
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'application/json',
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
body: JSON.stringify(message),
|
body: JSON.stringify(message),
|
||||||
})
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
console.error('Failed to send Slack notification')
|
console.error("Failed to send Slack notification");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}, timeout);
|
}, timeout);
|
||||||
|
@ -164,10 +184,36 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
app.get(
|
||||||
|
`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`,
|
||||||
|
async (req, res) => {
|
||||||
|
try {
|
||||||
|
const webScraperQueue = getWebScraperQueue();
|
||||||
|
const completedJobs = await webScraperQueue.getJobs(["completed"]);
|
||||||
|
const before24hJobs = completedJobs.filter(
|
||||||
|
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
|
||||||
|
);
|
||||||
|
const jobIds = before24hJobs.map((job) => job.id) as string[];
|
||||||
|
let count = 0;
|
||||||
|
for (const jobId of jobIds) {
|
||||||
|
try {
|
||||||
|
await webScraperQueue.removeJobs(jobId);
|
||||||
|
count++;
|
||||||
|
} catch (jobError) {
|
||||||
|
console.error(`Failed to remove job with ID ${jobId}:`, jobError);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
res.status(200).send(`Removed ${count} completed jobs.`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error("Failed to clean last 24h complete jobs:", error);
|
||||||
|
res.status(500).send("Failed to clean jobs");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
app.get("/is-production", (req, res) => {
|
app.get("/is-production", (req, res) => {
|
||||||
res.send({ isProduction: global.isProduction });
|
res.send({ isProduction: global.isProduction });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
console.log(`Worker ${process.pid} started`);
|
||||||
// /workers health check, cant act as load balancer, just has to be a pre deploy thing
|
}
|
||||||
|
|
|
@ -18,6 +18,8 @@ export type PageOptions = {
|
||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
|
removeTags?: string | string[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
@ -35,10 +37,7 @@ export type SearchOptions = {
|
||||||
location?: string;
|
location?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type WebScraperOptions = {
|
export type CrawlerOptions = {
|
||||||
urls: string[];
|
|
||||||
mode: "single_urls" | "sitemap" | "crawl";
|
|
||||||
crawlerOptions?: {
|
|
||||||
returnOnlyUrls?: boolean;
|
returnOnlyUrls?: boolean;
|
||||||
includes?: string[];
|
includes?: string[];
|
||||||
excludes?: string[];
|
excludes?: string[];
|
||||||
|
@ -47,8 +46,15 @@ export type WebScraperOptions = {
|
||||||
limit?: number;
|
limit?: number;
|
||||||
generateImgAltText?: boolean;
|
generateImgAltText?: boolean;
|
||||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
|
ignoreSitemap?: boolean;
|
||||||
mode?: "default" | "fast"; // have a mode of some sort
|
mode?: "default" | "fast"; // have a mode of some sort
|
||||||
};
|
allowBackwardCrawling?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
export type WebScraperOptions = {
|
||||||
|
urls: string[];
|
||||||
|
mode: "single_urls" | "sitemap" | "crawl";
|
||||||
|
crawlerOptions?: CrawlerOptions;
|
||||||
pageOptions?: PageOptions;
|
pageOptions?: PageOptions;
|
||||||
extractorOptions?: ExtractorOptions;
|
extractorOptions?: ExtractorOptions;
|
||||||
concurrentRequests?: number;
|
concurrentRequests?: number;
|
||||||
|
|
|
@ -3,7 +3,7 @@ import cheerio, { load } from "cheerio";
|
||||||
import { URL } from "url";
|
import { URL } from "url";
|
||||||
import { getLinksFromSitemap } from "./sitemap";
|
import { getLinksFromSitemap } from "./sitemap";
|
||||||
import async from "async";
|
import async from "async";
|
||||||
import { Progress } from "../../lib/entities";
|
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
|
||||||
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
|
import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
|
||||||
import robotsParser from "robots-parser";
|
import robotsParser from "robots-parser";
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@ export class WebCrawler {
|
||||||
private robotsTxtUrl: string;
|
private robotsTxtUrl: string;
|
||||||
private robots: any;
|
private robots: any;
|
||||||
private generateImgAltText: boolean;
|
private generateImgAltText: boolean;
|
||||||
|
private allowBackwardCrawling: boolean;
|
||||||
|
|
||||||
constructor({
|
constructor({
|
||||||
initialUrl,
|
initialUrl,
|
||||||
|
@ -29,6 +30,7 @@ export class WebCrawler {
|
||||||
limit = 10000,
|
limit = 10000,
|
||||||
generateImgAltText = false,
|
generateImgAltText = false,
|
||||||
maxCrawledDepth = 10,
|
maxCrawledDepth = 10,
|
||||||
|
allowBackwardCrawling = false
|
||||||
}: {
|
}: {
|
||||||
initialUrl: string;
|
initialUrl: string;
|
||||||
includes?: string[];
|
includes?: string[];
|
||||||
|
@ -37,6 +39,7 @@ export class WebCrawler {
|
||||||
limit?: number;
|
limit?: number;
|
||||||
generateImgAltText?: boolean;
|
generateImgAltText?: boolean;
|
||||||
maxCrawledDepth?: number;
|
maxCrawledDepth?: number;
|
||||||
|
allowBackwardCrawling?: boolean;
|
||||||
}) {
|
}) {
|
||||||
this.initialUrl = initialUrl;
|
this.initialUrl = initialUrl;
|
||||||
this.baseUrl = new URL(initialUrl).origin;
|
this.baseUrl = new URL(initialUrl).origin;
|
||||||
|
@ -49,6 +52,7 @@ export class WebCrawler {
|
||||||
this.maxCrawledLinks = maxCrawledLinks ?? limit;
|
this.maxCrawledLinks = maxCrawledLinks ?? limit;
|
||||||
this.maxCrawledDepth = maxCrawledDepth ?? 10;
|
this.maxCrawledDepth = maxCrawledDepth ?? 10;
|
||||||
this.generateImgAltText = generateImgAltText ?? false;
|
this.generateImgAltText = generateImgAltText ?? false;
|
||||||
|
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||||
|
@ -90,10 +94,16 @@ export class WebCrawler {
|
||||||
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
|
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
|
||||||
|
|
||||||
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
|
||||||
if (linkHostname !== initialHostname || !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
|
if (linkHostname !== initialHostname) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!this.allowBackwardCrawling) {
|
||||||
|
if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
|
||||||
// Check if the link is disallowed by robots.txt
|
// Check if the link is disallowed by robots.txt
|
||||||
if (!isAllowed) {
|
if (!isAllowed) {
|
||||||
|
@ -108,6 +118,8 @@ export class WebCrawler {
|
||||||
|
|
||||||
public async start(
|
public async start(
|
||||||
inProgress?: (progress: Progress) => void,
|
inProgress?: (progress: Progress) => void,
|
||||||
|
pageOptions?: PageOptions,
|
||||||
|
crawlerOptions?: CrawlerOptions,
|
||||||
concurrencyLimit: number = 5,
|
concurrencyLimit: number = 5,
|
||||||
limit: number = 10000,
|
limit: number = 10000,
|
||||||
maxDepth: number = 10
|
maxDepth: number = 10
|
||||||
|
@ -122,17 +134,21 @@ export class WebCrawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if(!crawlerOptions?.ignoreSitemap){
|
||||||
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
|
||||||
if (sitemapLinks.length > 0) {
|
if (sitemapLinks.length > 0) {
|
||||||
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
|
||||||
return filteredLinks.map(link => ({ url: link, html: "" }));
|
return filteredLinks.map(link => ({ url: link, html: "" }));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const urls = await this.crawlUrls(
|
const urls = await this.crawlUrls(
|
||||||
[this.initialUrl],
|
[this.initialUrl],
|
||||||
|
pageOptions,
|
||||||
concurrencyLimit,
|
concurrencyLimit,
|
||||||
inProgress
|
inProgress
|
||||||
);
|
);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
urls.length === 0 &&
|
urls.length === 0 &&
|
||||||
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
|
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
|
||||||
|
@ -140,14 +156,15 @@ export class WebCrawler {
|
||||||
return [{ url: this.initialUrl, html: "" }];
|
return [{ url: this.initialUrl, html: "" }];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// make sure to run include exclude here again
|
// make sure to run include exclude here again
|
||||||
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
|
const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
|
||||||
|
|
||||||
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
|
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
|
||||||
}
|
}
|
||||||
|
|
||||||
private async crawlUrls(
|
private async crawlUrls(
|
||||||
urls: string[],
|
urls: string[],
|
||||||
|
pageOptions: PageOptions,
|
||||||
concurrencyLimit: number,
|
concurrencyLimit: number,
|
||||||
inProgress?: (progress: Progress) => void,
|
inProgress?: (progress: Progress) => void,
|
||||||
): Promise<{ url: string, html: string }[]> {
|
): Promise<{ url: string, html: string }[]> {
|
||||||
|
@ -158,7 +175,7 @@ export class WebCrawler {
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const newUrls = await this.crawl(task);
|
const newUrls = await this.crawl(task, pageOptions);
|
||||||
// add the initial url if not already added
|
// add the initial url if not already added
|
||||||
// if (this.visited.size === 1) {
|
// if (this.visited.size === 1) {
|
||||||
// let normalizedInitial = this.initialUrl;
|
// let normalizedInitial = this.initialUrl;
|
||||||
|
@ -188,7 +205,7 @@ export class WebCrawler {
|
||||||
currentDocumentUrl: task,
|
currentDocumentUrl: task,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
|
await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
|
||||||
if (callback && typeof callback === "function") {
|
if (callback && typeof callback === "function") {
|
||||||
callback();
|
callback();
|
||||||
}
|
}
|
||||||
|
@ -207,20 +224,18 @@ export class WebCrawler {
|
||||||
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl(url: string): Promise<{url: string, html: string}[]> {
|
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string}[]> {
|
||||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")){
|
const normalizedUrl = this.normalizeCrawlUrl(url);
|
||||||
|
if (this.visited.has(normalizedUrl) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
this.visited.add(url);
|
this.visited.add(normalizedUrl);
|
||||||
|
|
||||||
|
|
||||||
if (!url.startsWith("http")) {
|
if (!url.startsWith("http")) {
|
||||||
url = "https://" + url;
|
url = "https://" + url;
|
||||||
|
|
||||||
}
|
}
|
||||||
if (url.endsWith("/")) {
|
if (url.endsWith("/")) {
|
||||||
url = url.slice(0, -1);
|
url = url.slice(0, -1);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
|
if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
|
||||||
|
@ -231,8 +246,8 @@ export class WebCrawler {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
// If it is the first link, fetch with single url
|
// If it is the first link, fetch with single url
|
||||||
if (this.visited.size === 1) {
|
if (this.visited.size === 1) {
|
||||||
const page = await scrapSingleUrl(url, {includeHtml: true});
|
const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
|
||||||
content = page.html ?? ""
|
content = page.html ?? "";
|
||||||
} else {
|
} else {
|
||||||
const response = await axios.get(url);
|
const response = await axios.get(url);
|
||||||
content = response.data ?? "";
|
content = response.data ?? "";
|
||||||
|
@ -241,12 +256,10 @@ export class WebCrawler {
|
||||||
let links: { url: string, html: string }[] = [];
|
let links: { url: string, html: string }[] = [];
|
||||||
|
|
||||||
// Add the initial URL to the list of links
|
// Add the initial URL to the list of links
|
||||||
if(this.visited.size === 1)
|
if (this.visited.size === 1) {
|
||||||
{
|
|
||||||
links.push({ url, html: content });
|
links.push({ url, html: content });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
$("a").each((_, element) => {
|
$("a").each((_, element) => {
|
||||||
const href = $(element).attr("href");
|
const href = $(element).attr("href");
|
||||||
if (href) {
|
if (href) {
|
||||||
|
@ -254,14 +267,15 @@ export class WebCrawler {
|
||||||
if (!href.startsWith("http")) {
|
if (!href.startsWith("http")) {
|
||||||
fullUrl = new URL(href, this.baseUrl).toString();
|
fullUrl = new URL(href, this.baseUrl).toString();
|
||||||
}
|
}
|
||||||
const url = new URL(fullUrl);
|
const urlObj = new URL(fullUrl);
|
||||||
const path = url.pathname;
|
const path = urlObj.pathname;
|
||||||
|
|
||||||
if (
|
if (
|
||||||
this.isInternalLink(fullUrl) &&
|
this.isInternalLink(fullUrl) &&
|
||||||
this.matchesPattern(fullUrl) &&
|
this.matchesPattern(fullUrl) &&
|
||||||
this.noSections(fullUrl) &&
|
this.noSections(fullUrl) &&
|
||||||
this.matchesIncludes(path) &&
|
// The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
|
||||||
|
// this.matchesIncludes(path) &&
|
||||||
!this.matchesExcludes(path) &&
|
!this.matchesExcludes(path) &&
|
||||||
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
|
this.robots.isAllowed(fullUrl, "FireCrawlAgent")
|
||||||
) {
|
) {
|
||||||
|
@ -274,12 +288,22 @@ export class WebCrawler {
|
||||||
return links;
|
return links;
|
||||||
}
|
}
|
||||||
// Create a new list to return to avoid modifying the visited list
|
// Create a new list to return to avoid modifying the visited list
|
||||||
return links.filter((link) => !this.visited.has(link.url));
|
return links.filter((link) => !this.visited.has(this.normalizeCrawlUrl(link.url)));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private normalizeCrawlUrl(url: string): string {
|
||||||
|
try{
|
||||||
|
const urlObj = new URL(url);
|
||||||
|
urlObj.searchParams.sort(); // Sort query parameters to normalize
|
||||||
|
return urlObj.toString();
|
||||||
|
} catch (error) {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private matchesIncludes(url: string): boolean {
|
private matchesIncludes(url: string): boolean {
|
||||||
if (this.includes.length === 0 || this.includes[0] == "") return true;
|
if (this.includes.length === 0 || this.includes[0] == "") return true;
|
||||||
return this.includes.some((pattern) => new RegExp(pattern).test(url));
|
return this.includes.some((pattern) => new RegExp(pattern).test(url));
|
||||||
|
@ -388,7 +412,6 @@ export class WebCrawler {
|
||||||
|
|
||||||
// Normalize and check if the URL is present in any of the sitemaps
|
// Normalize and check if the URL is present in any of the sitemaps
|
||||||
const normalizedUrl = normalizeUrl(url);
|
const normalizedUrl = normalizeUrl(url);
|
||||||
|
|
||||||
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
|
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
|
||||||
|
|
||||||
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
|
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
|
||||||
|
|
|
@ -31,12 +31,14 @@ export class WebScraperDataProvider {
|
||||||
private limit: number = 10000;
|
private limit: number = 10000;
|
||||||
private concurrentRequests: number = 20;
|
private concurrentRequests: number = 20;
|
||||||
private generateImgAltText: boolean = false;
|
private generateImgAltText: boolean = false;
|
||||||
|
private ignoreSitemap: boolean = false;
|
||||||
private pageOptions?: PageOptions;
|
private pageOptions?: PageOptions;
|
||||||
private extractorOptions?: ExtractorOptions;
|
private extractorOptions?: ExtractorOptions;
|
||||||
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
private replaceAllPathsWithAbsolutePaths?: boolean = false;
|
||||||
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
|
||||||
"gpt-4-turbo";
|
"gpt-4-turbo";
|
||||||
private crawlerMode: string = "default";
|
private crawlerMode: string = "default";
|
||||||
|
private allowBackwardCrawling: boolean = false;
|
||||||
|
|
||||||
authorize(): void {
|
authorize(): void {
|
||||||
throw new Error("Method not implemented.");
|
throw new Error("Method not implemented.");
|
||||||
|
@ -169,10 +171,15 @@ export class WebScraperDataProvider {
|
||||||
maxCrawledDepth: this.maxCrawledDepth,
|
maxCrawledDepth: this.maxCrawledDepth,
|
||||||
limit: this.limit,
|
limit: this.limit,
|
||||||
generateImgAltText: this.generateImgAltText,
|
generateImgAltText: this.generateImgAltText,
|
||||||
|
allowBackwardCrawling: this.allowBackwardCrawling,
|
||||||
});
|
});
|
||||||
|
|
||||||
let links = await crawler.start(
|
let links = await crawler.start(
|
||||||
inProgress,
|
inProgress,
|
||||||
|
this.pageOptions,
|
||||||
|
{
|
||||||
|
ignoreSitemap: this.ignoreSitemap,
|
||||||
|
},
|
||||||
5,
|
5,
|
||||||
this.limit,
|
this.limit,
|
||||||
this.maxCrawledDepth
|
this.maxCrawledDepth
|
||||||
|
@ -296,9 +303,10 @@ export class WebScraperDataProvider {
|
||||||
}
|
}
|
||||||
|
|
||||||
private applyPathReplacements(documents: Document[]): Document[] {
|
private applyPathReplacements(documents: Document[]): Document[] {
|
||||||
return this.replaceAllPathsWithAbsolutePaths
|
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||||
? replacePathsWithAbsolutePaths(documents)
|
documents = replacePathsWithAbsolutePaths(documents);
|
||||||
: replaceImgPathsWithAbsolutePaths(documents);
|
}
|
||||||
|
return replaceImgPathsWithAbsolutePaths(documents);
|
||||||
}
|
}
|
||||||
|
|
||||||
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
||||||
|
@ -467,12 +475,19 @@ export class WebScraperDataProvider {
|
||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
this.pageOptions = options.pageOptions ?? {
|
||||||
|
onlyMainContent: false,
|
||||||
|
includeHtml: false,
|
||||||
|
replaceAllPathsWithAbsolutePaths: false,
|
||||||
|
removeTags: []
|
||||||
|
};
|
||||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||||
|
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
|
||||||
|
this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
|
||||||
|
|
||||||
// make sure all urls start with https://
|
// make sure all urls start with https://
|
||||||
this.urls = this.urls.map((url) => {
|
this.urls = this.urls.map((url) => {
|
||||||
|
|
|
@ -8,6 +8,7 @@ import { excludeNonMainTags } from "./utils/excludeTags";
|
||||||
import { urlSpecificParams } from "./utils/custom/website_params";
|
import { urlSpecificParams } from "./utils/custom/website_params";
|
||||||
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
|
||||||
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
import { handleCustomScraping } from "./custom/handleCustomScraping";
|
||||||
|
import axios from "axios";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
|
@ -19,6 +20,8 @@ const baseScrapers = [
|
||||||
"fetch",
|
"fetch",
|
||||||
] as const;
|
] as const;
|
||||||
|
|
||||||
|
const universalTimeout = 15000;
|
||||||
|
|
||||||
export async function generateRequestParams(
|
export async function generateRequestParams(
|
||||||
url: string,
|
url: string,
|
||||||
wait_browser: string = "domcontentloaded",
|
wait_browser: string = "domcontentloaded",
|
||||||
|
@ -59,21 +62,24 @@ export async function scrapWithFireEngine(
|
||||||
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
|
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam}`
|
||||||
);
|
);
|
||||||
|
|
||||||
const response = await fetch(process.env.FIRE_ENGINE_BETA_URL + "/scrape", {
|
const response = await axios.post(
|
||||||
method: "POST",
|
process.env.FIRE_ENGINE_BETA_URL + "/scrape",
|
||||||
headers: {
|
{
|
||||||
"Content-Type": "application/json",
|
|
||||||
},
|
|
||||||
body: JSON.stringify({
|
|
||||||
url: url,
|
url: url,
|
||||||
wait: waitParam,
|
wait: waitParam,
|
||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
pageOptions: pageOptions
|
pageOptions: pageOptions,
|
||||||
}),
|
},
|
||||||
});
|
{
|
||||||
|
headers: {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
},
|
||||||
|
timeout: universalTimeout + waitParam
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
if (!response.ok) {
|
if (response.status !== 200) {
|
||||||
console.error(
|
console.error(
|
||||||
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
|
||||||
);
|
);
|
||||||
|
@ -84,13 +90,17 @@ export async function scrapWithFireEngine(
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
return { html: await fetchAndProcessPdf(url), screenshot: "" };
|
||||||
} else {
|
} else {
|
||||||
const data = await response.json();
|
const data = response.data;
|
||||||
const html = data.content;
|
const html = data.content;
|
||||||
const screenshot = data.screenshot;
|
const screenshot = data.screenshot;
|
||||||
return { html: html ?? "", screenshot: screenshot ?? "" };
|
return { html: html ?? "", screenshot: screenshot ?? "" };
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
if (error.code === 'ECONNABORTED') {
|
||||||
|
console.log(`[Fire-Engine] Request timed out for ${url}`);
|
||||||
|
} else {
|
||||||
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
||||||
|
}
|
||||||
return { html: "", screenshot: "" };
|
return { html: "", screenshot: "" };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -98,7 +108,7 @@ export async function scrapWithFireEngine(
|
||||||
export async function scrapWithScrapingBee(
|
export async function scrapWithScrapingBee(
|
||||||
url: string,
|
url: string,
|
||||||
wait_browser: string = "domcontentloaded",
|
wait_browser: string = "domcontentloaded",
|
||||||
timeout: number = 15000
|
timeout: number = universalTimeout
|
||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
||||||
|
@ -141,15 +151,19 @@ export async function scrapWithPlaywright(
|
||||||
// If the user has passed a wait parameter in the request, use that
|
// If the user has passed a wait parameter in the request, use that
|
||||||
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
const waitParam = reqParams["params"]?.wait ?? waitFor;
|
||||||
|
|
||||||
const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
|
const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
|
||||||
method: "POST",
|
url: url,
|
||||||
|
wait_after_load: waitParam,
|
||||||
|
headers: headers,
|
||||||
|
}, {
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
},
|
},
|
||||||
body: JSON.stringify({ url: url, wait_after_load: waitParam, headers: headers }),
|
timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
|
||||||
|
transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (response.status !== 200) {
|
||||||
console.error(
|
console.error(
|
||||||
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
`[Playwright] Error fetching url: ${url} with status: ${response.status}`
|
||||||
);
|
);
|
||||||
|
@ -160,7 +174,7 @@ export async function scrapWithPlaywright(
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url);
|
||||||
} else {
|
} else {
|
||||||
const textData = await response.text();
|
const textData = response.data;
|
||||||
try {
|
try {
|
||||||
const data = JSON.parse(textData);
|
const data = JSON.parse(textData);
|
||||||
const html = data.content;
|
const html = data.content;
|
||||||
|
@ -171,17 +185,28 @@ export async function scrapWithPlaywright(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
if (error.code === 'ECONNABORTED') {
|
||||||
|
console.log(`[Playwright] Request timed out for ${url}`);
|
||||||
|
} else {
|
||||||
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
||||||
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapWithFetch(url: string): Promise<string> {
|
export async function scrapWithFetch(url: string): Promise<string> {
|
||||||
try {
|
try {
|
||||||
const response = await fetch(url);
|
const response = await axios.get(url, {
|
||||||
if (!response.ok) {
|
headers: {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
},
|
||||||
|
timeout: universalTimeout,
|
||||||
|
transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
|
||||||
|
});
|
||||||
|
|
||||||
|
if (response.status !== 200) {
|
||||||
console.error(
|
console.error(
|
||||||
`[Fetch] Error fetching url: ${url} with status: ${response.status}`
|
`[Axios] Error fetching url: ${url} with status: ${response.status}`
|
||||||
);
|
);
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
@ -190,11 +215,15 @@ export async function scrapWithFetch(url: string): Promise<string> {
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
return fetchAndProcessPdf(url);
|
return fetchAndProcessPdf(url);
|
||||||
} else {
|
} else {
|
||||||
const text = await response.text();
|
const text = response.data;
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`[Fetch][c] Error fetching url: ${url} -> ${error}`);
|
if (error.code === 'ECONNABORTED') {
|
||||||
|
console.log(`[Axios] Request timed out for ${url}`);
|
||||||
|
} else {
|
||||||
|
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
|
||||||
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -275,6 +304,19 @@ export async function scrapSingleUrl(
|
||||||
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||||
const soup = cheerio.load(html);
|
const soup = cheerio.load(html);
|
||||||
soup("script, style, iframe, noscript, meta, head").remove();
|
soup("script, style, iframe, noscript, meta, head").remove();
|
||||||
|
|
||||||
|
if (pageOptions.removeTags) {
|
||||||
|
if (typeof pageOptions.removeTags === 'string') {
|
||||||
|
pageOptions.removeTags.split(',').forEach((tag) => {
|
||||||
|
soup(tag.trim()).remove();
|
||||||
|
});
|
||||||
|
} else if (Array.isArray(pageOptions.removeTags)) {
|
||||||
|
pageOptions.removeTags.forEach((tag) => {
|
||||||
|
soup(tag).remove();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (pageOptions.onlyMainContent) {
|
if (pageOptions.onlyMainContent) {
|
||||||
// remove any other tags that are not in the main content
|
// remove any other tags that are not in the main content
|
||||||
excludeNonMainTags.forEach((tag) => {
|
excludeNonMainTags.forEach((tag) => {
|
||||||
|
|
|
@ -12,6 +12,7 @@ export async function getLinksFromSitemap(
|
||||||
content = response.data;
|
content = response.data;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`Request failed for ${sitemapUrl}: ${error}`);
|
console.error(`Request failed for ${sitemapUrl}: ${error}`);
|
||||||
|
|
||||||
return allUrls;
|
return allUrls;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,12 +6,14 @@ describe('replacePaths', () => {
|
||||||
it('should replace relative paths with absolute paths', () => {
|
it('should replace relative paths with absolute paths', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).'
|
content: 'This is a [link](/path/to/resource).',
|
||||||
|
markdown: 'This is a [link](/path/to/resource).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).'
|
content: 'This is a [link](https://example.com/path/to/resource).',
|
||||||
|
markdown: 'This is a [link](https://example.com/path/to/resource).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
|
@ -21,7 +23,8 @@ describe('replacePaths', () => {
|
||||||
it('should not alter absolute URLs', () => {
|
it('should not alter absolute URLs', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).'
|
content: 'This is an [external link](https://external.com/path).',
|
||||||
|
markdown: 'This is an [external link](https://external.com/path).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
|
@ -31,7 +34,8 @@ describe('replacePaths', () => {
|
||||||
it('should not alter data URLs for images', () => {
|
it('should not alter data URLs for images', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is an image: ![alt text]().'
|
content: 'This is an image: ![alt text]().',
|
||||||
|
markdown: 'This is an image: ![alt text]().'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
|
@ -41,12 +45,14 @@ describe('replacePaths', () => {
|
||||||
it('should handle multiple links and images correctly', () => {
|
it('should handle multiple links and images correctly', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).'
|
content: 'Here are two links: [link1](/path1) and [link2](/path2).',
|
||||||
|
markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).'
|
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
|
||||||
|
markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
|
@ -56,12 +62,14 @@ describe('replacePaths', () => {
|
||||||
it('should correctly handle a mix of absolute and relative paths', () => {
|
it('should correctly handle a mix of absolute and relative paths', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().'
|
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().',
|
||||||
|
markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().'
|
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().',
|
||||||
|
markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
|
@ -74,12 +82,14 @@ describe('replacePaths', () => {
|
||||||
it('should replace relative image paths with absolute paths', () => {
|
it('should replace relative image paths with absolute paths', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here is an image: ![alt text](/path/to/image.jpg).'
|
content: 'Here is an image: ![alt text](/path/to/image.jpg).',
|
||||||
|
markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
|
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
|
||||||
|
markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
@ -89,7 +99,8 @@ describe('replacePaths', () => {
|
||||||
it('should not alter data:image URLs', () => {
|
it('should not alter data:image URLs', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'An image with a data URL: ![alt text]().'
|
content: 'An image with a data URL: ![alt text]().',
|
||||||
|
markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
@ -99,12 +110,14 @@ describe('replacePaths', () => {
|
||||||
it('should handle multiple images with a mix of data and relative URLs', () => {
|
it('should handle multiple images with a mix of data and relative URLs', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).'
|
content: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).',
|
||||||
|
markdown: 'Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).'
|
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).',
|
||||||
|
markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
|
|
@ -10,6 +10,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
paths.forEach((path: string) => {
|
paths.forEach((path: string) => {
|
||||||
|
try {
|
||||||
const isImage = path.startsWith("!");
|
const isImage = path.startsWith("!");
|
||||||
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
|
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
|
||||||
let url = matchedUrl[1];
|
let url = matchedUrl[1];
|
||||||
|
@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
||||||
}
|
}
|
||||||
|
|
||||||
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
|
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
|
||||||
if (isImage) {
|
// Image is handled afterwards
|
||||||
document.content = document.content.replace(
|
if (!isImage) {
|
||||||
path,
|
|
||||||
`${markdownLinkOrImageText}(${url})`
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
document.content = document.content.replace(
|
document.content = document.content.replace(
|
||||||
path,
|
path,
|
||||||
`${markdownLinkOrImageText}(${url})`
|
`${markdownLinkOrImageText}(${url})`
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
document.markdown = document.content;
|
||||||
});
|
});
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
|
@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
||||||
if (!imageUrl.startsWith("http")) {
|
if (!imageUrl.startsWith("http")) {
|
||||||
if (imageUrl.startsWith("/")) {
|
if (imageUrl.startsWith("/")) {
|
||||||
imageUrl = imageUrl.substring(1);
|
imageUrl = imageUrl.substring(1);
|
||||||
}
|
|
||||||
imageUrl = new URL(imageUrl, baseUrl).toString();
|
imageUrl = new URL(imageUrl, baseUrl).toString();
|
||||||
|
} else {
|
||||||
|
imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
||||||
`![${altText}](${imageUrl})`
|
`![${altText}](${imageUrl})`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
document.markdown = document.content;
|
||||||
});
|
});
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
|
|
|
@ -38,7 +38,7 @@ getWebScraperQueue().process(
|
||||||
error: message /* etc... */,
|
error: message /* etc... */,
|
||||||
};
|
};
|
||||||
|
|
||||||
await callWebhook(job.data.team_id, data);
|
await callWebhook(job.data.team_id, job.id as string, data);
|
||||||
|
|
||||||
await logJob({
|
await logJob({
|
||||||
success: success,
|
success: success,
|
||||||
|
@ -78,7 +78,7 @@ getWebScraperQueue().process(
|
||||||
error:
|
error:
|
||||||
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
"Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
|
||||||
};
|
};
|
||||||
await callWebhook(job.data.team_id, data);
|
await callWebhook(job.data.team_id, job.id as string, data);
|
||||||
await logJob({
|
await logJob({
|
||||||
success: false,
|
success: false,
|
||||||
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
|
message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
|
||||||
|
|
|
@ -1,8 +1,35 @@
|
||||||
import Redis from 'ioredis';
|
import Redis from "ioredis";
|
||||||
|
|
||||||
// Initialize Redis client
|
// Initialize Redis client
|
||||||
const redis = new Redis(process.env.REDIS_URL);
|
const redis = new Redis(process.env.REDIS_URL);
|
||||||
|
|
||||||
|
// Listen to 'error' events to the Redis connection
|
||||||
|
redis.on("error", (error) => {
|
||||||
|
try {
|
||||||
|
if (error.message === "ECONNRESET") {
|
||||||
|
console.log("Connection to Redis Session Store timed out.");
|
||||||
|
} else if (error.message === "ECONNREFUSED") {
|
||||||
|
console.log("Connection to Redis Session Store refused!");
|
||||||
|
} else console.log(error);
|
||||||
|
} catch (error) {}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Listen to 'reconnecting' event to Redis
|
||||||
|
redis.on("reconnecting", (err) => {
|
||||||
|
try {
|
||||||
|
if (redis.status === "reconnecting")
|
||||||
|
console.log("Reconnecting to Redis Session Store...");
|
||||||
|
else console.log("Error reconnecting to Redis Session Store.");
|
||||||
|
} catch (error) {}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Listen to the 'connect' event to Redis
|
||||||
|
redis.on("connect", (err) => {
|
||||||
|
try {
|
||||||
|
if (!err) console.log("Connected to Redis Session Store!");
|
||||||
|
} catch (error) {}
|
||||||
|
});
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set a value in Redis with an optional expiration time.
|
* Set a value in Redis with an optional expiration time.
|
||||||
* @param {string} key The key under which to store the value.
|
* @param {string} key The key under which to store the value.
|
||||||
|
@ -11,7 +38,7 @@ const redis = new Redis(process.env.REDIS_URL);
|
||||||
*/
|
*/
|
||||||
const setValue = async (key: string, value: string, expire?: number) => {
|
const setValue = async (key: string, value: string, expire?: number) => {
|
||||||
if (expire) {
|
if (expire) {
|
||||||
await redis.set(key, value, 'EX', expire);
|
await redis.set(key, value, "EX", expire);
|
||||||
} else {
|
} else {
|
||||||
await redis.set(key, value);
|
await redis.set(key, value);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,17 +1,19 @@
|
||||||
import { supabase_service } from "./supabase";
|
import { supabase_service } from "./supabase";
|
||||||
|
|
||||||
export const callWebhook = async (teamId: string, data: any) => {
|
export const callWebhook = async (teamId: string, jobId: string,data: any) => {
|
||||||
try {
|
try {
|
||||||
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
|
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
|
||||||
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||||
let webhookUrl = selfHostedUrl;
|
let webhookUrl = selfHostedUrl;
|
||||||
|
|
||||||
if (!selfHostedUrl) {
|
// Only fetch the webhook URL from the database if the self-hosted webhook URL is not set
|
||||||
|
// and the USE_DB_AUTHENTICATION environment variable is set to true
|
||||||
|
if (!selfHostedUrl && useDbAuthentication) {
|
||||||
const { data: webhooksData, error } = await supabase_service
|
const { data: webhooksData, error } = await supabase_service
|
||||||
.from("webhooks")
|
.from("webhooks")
|
||||||
.select("url")
|
.select("url")
|
||||||
.eq("team_id", teamId)
|
.eq("team_id", teamId)
|
||||||
.limit(1);
|
.limit(1);
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
console.error(
|
console.error(
|
||||||
`Error fetching webhook URL for team ID: ${teamId}`,
|
`Error fetching webhook URL for team ID: ${teamId}`,
|
||||||
|
@ -45,6 +47,7 @@ export const callWebhook = async (teamId: string, data: any) => {
|
||||||
},
|
},
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
success: data.success,
|
success: data.success,
|
||||||
|
jobId: jobId,
|
||||||
data: dataToSend,
|
data: dataToSend,
|
||||||
error: data.error || undefined,
|
error: data.error || undefined,
|
||||||
}),
|
}),
|
||||||
|
|
|
@ -1,3 +1,57 @@
|
||||||
|
"""
|
||||||
|
This is the Firecrawl package.
|
||||||
|
|
||||||
|
This package provides a Python SDK for interacting with the Firecrawl API.
|
||||||
|
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
||||||
|
and check the status of these jobs.
|
||||||
|
|
||||||
|
For more information visit https://github.com/firecrawl/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
from .firecrawl import FirecrawlApp
|
from .firecrawl import FirecrawlApp
|
||||||
|
|
||||||
__version__ = "0.0.14"
|
__version__ = "0.0.15"
|
||||||
|
|
||||||
|
# Define the logger for the Firecrawl project
|
||||||
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||||
|
|
||||||
|
|
||||||
|
def _basic_config() -> None:
|
||||||
|
"""Set up basic configuration for logging with a specific format and date format."""
|
||||||
|
try:
|
||||||
|
logging.basicConfig(
|
||||||
|
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to configure logging: %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging() -> None:
|
||||||
|
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
|
||||||
|
env = os.environ.get(
|
||||||
|
"FIRECRAWL_LOGGING_LEVEL", "INFO"
|
||||||
|
).upper() # Default to 'INFO' level
|
||||||
|
_basic_config()
|
||||||
|
|
||||||
|
if env == "DEBUG":
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
elif env == "INFO":
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
elif env == "WARNING":
|
||||||
|
logger.setLevel(logging.WARNING)
|
||||||
|
elif env == "ERROR":
|
||||||
|
logger.setLevel(logging.ERROR)
|
||||||
|
elif env == "CRITICAL":
|
||||||
|
logger.setLevel(logging.CRITICAL)
|
||||||
|
else:
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
logger.warning("Unknown logging level: %s, defaulting to INFO", env)
|
||||||
|
|
||||||
|
|
||||||
|
# Initialize logging configuration when the module is imported
|
||||||
|
setup_logging()
|
||||||
|
logger.debug("Debugging logger setup")
|
||||||
|
|
|
@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
|
||||||
Classes:
|
Classes:
|
||||||
- FirecrawlApp: Main class for interacting with the Firecrawl API.
|
- FirecrawlApp: Main class for interacting with the Firecrawl API.
|
||||||
"""
|
"""
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
logger : logging.Logger = logging.getLogger("firecrawl")
|
||||||
|
|
||||||
class FirecrawlApp:
|
class FirecrawlApp:
|
||||||
"""
|
"""
|
||||||
|
@ -28,8 +29,14 @@ class FirecrawlApp:
|
||||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||||
if self.api_key is None:
|
if self.api_key is None:
|
||||||
|
logger.warning("No API key provided")
|
||||||
raise ValueError('No API key provided')
|
raise ValueError('No API key provided')
|
||||||
|
else:
|
||||||
|
logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
|
||||||
|
|
||||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||||
|
if self.api_url != 'https://api.firecrawl.dev':
|
||||||
|
logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
|
||||||
|
|
||||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||||
"""
|
"""
|
||||||
|
|
Before Width: | Height: | Size: 7.8 KiB After Width: | Height: | Size: 7.8 KiB |
Before Width: | Height: | Size: 23 KiB After Width: | Height: | Size: 23 KiB |
Before Width: | Height: | Size: 7.0 KiB After Width: | Height: | Size: 7.0 KiB |
Before Width: | Height: | Size: 444 KiB After Width: | Height: | Size: 444 KiB |
Before Width: | Height: | Size: 492 B After Width: | Height: | Size: 492 B |
Before Width: | Height: | Size: 997 B After Width: | Height: | Size: 997 B |
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 262 KiB After Width: | Height: | Size: 262 KiB |
Before Width: | Height: | Size: 629 B After Width: | Height: | Size: 629 B |
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 15 KiB |
|
@ -1,4 +1,4 @@
|
||||||
import OpenAI from "openai";
|
import OpenAI from "openai/index.mjs";
|
||||||
import { encoding_for_model } from "@dqbd/tiktoken";
|
import { encoding_for_model } from "@dqbd/tiktoken";
|
||||||
|
|
||||||
/**
|
/**
|