Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 03:32:22 +08:00)

Commit 1601e93d69: Merge branch 'main' into test/crawl-options
@@ -18,8 +18,8 @@
   "paths": {
     "/scrape": {
       "post": {
-        "summary": "Scrape a single URL",
-        "operationId": "scrapeSingleUrl",
+        "summary": "Scrape a single URL and optionally extract information using an LLM",
+        "operationId": "scrapeAndExtractFromUrl",
         "tags": ["Scraping"],
         "security": [
           {
@@ -45,8 +45,43 @@
                    "type": "boolean",
                    "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
                    "default": false
+                  },
+                  "includeHtml": {
+                    "type": "boolean",
+                    "description": "Include the raw HTML content of the page. Will output a html key in the response.",
+                    "default": false
                   }
                 }
+              },
+              "extractorOptions": {
+                "type": "object",
+                "description": "Options for LLM-based extraction of structured information from the page content",
+                "properties": {
+                  "mode": {
+                    "type": "string",
+                    "enum": ["llm-extraction"],
+                    "description": "The extraction mode to use, currently supports 'llm-extraction'"
+                  },
+                  "extractionPrompt": {
+                    "type": "string",
+                    "description": "A prompt describing what information to extract from the page"
+                  },
+                  "extractionSchema": {
+                    "type": "object",
+                    "additionalProperties": true,
+                    "description": "The schema for the data to be extracted",
+                    "required": [
+                      "company_mission",
+                      "supports_sso",
+                      "is_open_source"
+                    ]
+                  }
+                }
+              },
+              "timeout": {
+                "type": "integer",
+                "description": "Timeout in milliseconds for the request",
+                "default": 30000
               }
             },
             "required": ["url"]
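For reference, a request that exercises the new extractorOptions might look like the sketch below. This is illustrative client code, not part of the commit: the base URL (https://api.firecrawl.dev/v0) and the environment variable holding the API key are assumptions; the field names (extractorOptions, mode, extractionPrompt, extractionSchema, timeout, pageOptions) come from the schema above.

// Hypothetical client call against the /scrape endpoint described above.
// Assumes Node 18+ (global fetch) and an API key in FIRECRAWL_API_KEY.
const response = await fetch("https://api.firecrawl.dev/v0/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com",
    pageOptions: { onlyMainContent: true, includeHtml: false },
    extractorOptions: {
      mode: "llm-extraction",
      extractionPrompt: "Extract the company mission and whether it supports SSO.",
      extractionSchema: {
        type: "object",
        properties: {
          company_mission: { type: "string" },
          supports_sso: { type: "boolean" },
          is_open_source: { type: "boolean" },
        },
        required: ["company_mission", "supports_sso", "is_open_source"],
      },
    },
    timeout: 30000,
  }),
});
console.log(await response.json());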
@@ -126,6 +161,16 @@
                   "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
                   "default": false
                 },
+                "maxDepth": {
+                  "type": "integer",
+                  "description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
+                },
+                "mode": {
+                  "type": "string",
+                  "enum": ["default", "fast"],
+                  "description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
+                  "default": "default"
+                },
                 "limit": {
                   "type": "integer",
                   "description": "Maximum number of pages to crawl",
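A crawl request that opts into the new mode and maxDepth options could look like the following sketch. It is illustrative only: the /v0/crawl path and base URL are assumptions, while the option names come from the schema above and the jobId field in the response matches the E2E test further down.

// Hypothetical kick-off of a fast-mode crawl; the response carries a jobId to poll.
const crawlResponse = await fetch("https://api.firecrawl.dev/v0/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com",
    crawlerOptions: { mode: "fast", maxDepth: 2, limit: 50 },
    pageOptions: { onlyMainContent: true, includeHtml: true },
  }),
});
const { jobId } = await crawlResponse.json();
console.log(`Crawl started: ${jobId}`);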
@@ -140,6 +185,11 @@
                   "type": "boolean",
                   "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
                   "default": false
+                },
+                "includeHtml": {
+                  "type": "boolean",
+                  "description": "Include the raw HTML content of the page. Will output a html key in the response.",
+                  "default": false
                 }
               }
             }
@@ -206,6 +256,11 @@
                   "type": "boolean",
                   "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
                   "default": true
+                },
+                "includeHtml": {
+                  "type": "boolean",
+                  "description": "Include the raw HTML content of the page. Will output a html key in the response.",
+                  "default": false
                 }
               }
             },
@@ -302,6 +357,63 @@
                      "$ref": "#/components/schemas/ScrapeResponse"
                    },
                    "description": "Data returned from the job (null when it is in progress)"
+                  },
+                  "partial_data": {
+                    "type": "array",
+                    "items": {
+                      "$ref": "#/components/schemas/ScrapeResponse"
+                    },
+                    "description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled."
+                  }
+                }
+              }
+            }
+          }
+        },
+        "402": {
+          "description": "Payment required"
+        },
+        "429": {
+          "description": "Too many requests"
+        },
+        "500": {
+          "description": "Server error"
+        }
+      }
+    }
+  },
+  "/crawl/cancel/{jobId}": {
+    "delete": {
+      "tags": ["Crawl"],
+      "summary": "Cancel a crawl job",
+      "operationId": "cancelCrawlJob",
+      "security": [
+        {
+          "bearerAuth": []
+        }
+      ],
+      "parameters": [
+        {
+          "name": "jobId",
+          "in": "path",
+          "description": "ID of the crawl job",
+          "required": true,
+          "schema": {
+            "type": "string"
+          }
+        }
+      ],
+      "responses": {
+        "200": {
+          "description": "Successful response",
+          "content": {
+            "application/json": {
+              "schema": {
+                "type": "object",
+                "properties": {
+                  "status": {
+                    "type": "string",
+                    "description": "Returns cancelled."
+                  }
                 }
               }
             }
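Combined with the new partial_data field and the cancel route, a client could poll the job and bail out as in the sketch below. This is illustrative only: the /v0 prefix and base URL are assumptions, the jobId placeholder stands in for the value returned by the crawl request, and the shape of the cancel response is inferred from the "Returns cancelled." description above.

// Hypothetical polling loop that reads partial_data while the crawl runs,
// and cancels the job if it takes too long.
const base = "https://api.firecrawl.dev/v0";
const headers = { Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}` };
const jobId = "<jobId from the crawl request>"; // placeholder

const deadline = Date.now() + 60_000;
while (true) {
  const status = await (await fetch(`${base}/crawl/status/${jobId}`, { headers })).json();
  console.log(`status=${status.status}, partial pages so far=${status.partial_data?.length ?? 0}`);
  if (status.status === "completed") break;
  if (Date.now() > deadline) {
    // DELETE /crawl/cancel/{jobId}; per the schema above the response carries a status string ("cancelled").
    await fetch(`${base}/crawl/cancel/${jobId}`, { method: "DELETE", headers });
    break;
  }
  await new Promise((r) => setTimeout(r, 1000));
}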
@@ -344,6 +456,11 @@
           "content": {
             "type": "string"
           },
+          "html": {
+            "type": "string",
+            "nullable": true,
+            "description": "Raw HTML content of the page if `includeHtml` is true"
+          },
           "metadata": {
             "type": "object",
             "properties": {
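A hedged TypeScript view of the ScrapeResponse data object implied by this addition (field names follow the schema above; everything not touched by the commit is omitted):

// Hypothetical shape of a returned document after this change.
interface ScrapeResponseData {
  content: string;                    // cleaned page content
  html?: string | null;               // present when pageOptions.includeHtml is true
  metadata: Record<string, unknown>;  // trimmed here; the full schema defines more fields
}

function hasRawHtml(doc: ScrapeResponseData): boolean {
  return typeof doc.html === "string" && doc.html.length > 0;
}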
@@ -660,6 +660,107 @@ describe("E2E Tests for API Routes", () => {
  //   }, 120000); // 120 secs
  // });

+  describe("POST /v0/crawl with fast mode", () => {
+    it("should complete the crawl under 20 seconds", async () => {
+      const startTime = Date.now();
+
+      const crawlResponse = await request(TEST_URL)
+        .post("/v0/crawl")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({
+          url: "https://flutterbricks.com",
+          crawlerOptions: {
+            mode: "fast"
+          }
+        });
+
+      expect(crawlResponse.statusCode).toBe(200);
+
+      const jobId = crawlResponse.body.jobId;
+      let statusResponse;
+      let isFinished = false;
+
+      while (!isFinished) {
+        statusResponse = await request(TEST_URL)
+          .get(`/v0/crawl/status/${jobId}`)
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+        expect(statusResponse.statusCode).toBe(200);
+        isFinished = statusResponse.body.status === "completed";
+
+        if (!isFinished) {
+          await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+        }
+      }
+
+      const endTime = Date.now();
+      const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
+
+      console.log(`Time elapsed: ${timeElapsed} seconds`);
+
+      expect(statusResponse.body.status).toBe("completed");
+      expect(statusResponse.body).toHaveProperty("data");
+      expect(statusResponse.body.data[0]).toHaveProperty("content");
+      expect(statusResponse.body.data[0]).toHaveProperty("markdown");
+      const results = statusResponse.body.data;
+      // results.forEach((result, i) => {
+      //   console.log(result.metadata.sourceURL);
+      // });
+      expect(results.length).toBeGreaterThanOrEqual(10);
+      expect(results.length).toBeLessThanOrEqual(15);
+
+    }, 20000);
+
+    // it("should complete the crawl in more than 10 seconds", async () => {
+    //   const startTime = Date.now();
+
+    //   const crawlResponse = await request(TEST_URL)
+    //     .post("/v0/crawl")
+    //     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    //     .set("Content-Type", "application/json")
+    //     .send({
+    //       url: "https://flutterbricks.com",
+    //     });
+
+    //   expect(crawlResponse.statusCode).toBe(200);
+
+    //   const jobId = crawlResponse.body.jobId;
+    //   let statusResponse;
+    //   let isFinished = false;
+
+    //   while (!isFinished) {
+    //     statusResponse = await request(TEST_URL)
+    //       .get(`/v0/crawl/status/${jobId}`)
+    //       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
+
+    //     expect(statusResponse.statusCode).toBe(200);
+    //     isFinished = statusResponse.body.status === "completed";
+
+    //     if (!isFinished) {
+    //       await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
+    //     }
+    //   }
+
+    //   const endTime = Date.now();
+    //   const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
+
+    //   console.log(`Time elapsed: ${timeElapsed} seconds`);
+
+    //   expect(statusResponse.body.status).toBe("completed");
+    //   expect(statusResponse.body).toHaveProperty("data");
+    //   expect(statusResponse.body.data[0]).toHaveProperty("content");
+    //   expect(statusResponse.body.data[0]).toHaveProperty("markdown");
+    //   const results = statusResponse.body.data;
+    //   // results.forEach((result, i) => {
+    //   //   console.log(result.metadata.sourceURL);
+    //   // });
+    //   expect(results.length).toBeGreaterThanOrEqual(10);
+    //   expect(results.length).toBeLessThanOrEqual(15);
+
+    // }, 50000);// 15 seconds timeout to account for network delays
+  });
+
   describe("GET /is-production", () => {
     it("should return the production status", async () => {
       const response = await request(TEST_URL).get("/is-production");
@@ -44,6 +44,7 @@ export type WebScraperOptions = {
     limit?: number;
     generateImgAltText?: boolean;
     replaceAllPathsWithAbsolutePaths?: boolean;
+    mode?: "default" | "fast"; // have a mode of some sort
   };
   pageOptions?: PageOptions;
   extractorOptions?: ExtractorOptions;
@@ -4,7 +4,7 @@ import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import async from "async";
 import { Progress } from "../../lib/entities";
-import { scrapWithScrapingBee } from "./single_url";
+import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 
 export class WebCrawler {
@@ -15,7 +15,7 @@ export class WebCrawler {
   private maxCrawledLinks: number;
   private maxCrawledDepth: number;
   private visited: Set<string> = new Set();
-  private crawledUrls: Set<string> = new Set();
+  private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
   private robots: any;
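The switch from a Set to a Map is what lets the crawler keep the HTML it already downloaded while discovering links, so fast mode can reuse it later instead of fetching each page again. A minimal standalone sketch of the idea (illustrative, not the project's code):

// A Set only records that a URL was seen; a Map can also keep the HTML fetched
// while discovering links, so later stages can reuse it instead of downloading again.
const crawledUrls = new Map<string, string>();
crawledUrls.set("https://example.com/", "<html><body>home</body></html>");
crawledUrls.set("https://example.com/docs", "<html><body>docs</body></html>");

// The crawler's start()/crawlUrls() now return this shape:
const pages: { url: string; html: string }[] =
  Array.from(crawledUrls.entries()).map(([url, html]) => ({ url, html }));
console.log(pages.map((p) => `${p.url} (${p.html.length} chars of HTML)`));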
@@ -51,7 +51,6 @@
     this.generateImgAltText = generateImgAltText ?? false;
   }
 
-
   private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
     return sitemapLinks
       .filter((link) => {
@@ -99,7 +98,7 @@
     concurrencyLimit: number = 5,
     limit: number = 10000,
     maxDepth: number = 10
-  ): Promise<string[]> {
+  ): Promise<{ url: string, html: string }[]> {
     // Fetch and parse robots.txt
     try {
       const response = await axios.get(this.robotsTxtUrl);
@@ -111,7 +110,7 @@
     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
     if (sitemapLinks.length > 0) {
       const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-      return filteredLinks;
+      return filteredLinks.map(link => ({ url: link, html: "" }));
     }
 
     const urls = await this.crawlUrls(
@@ -123,18 +122,19 @@
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
     ) {
-      return [this.initialUrl];
+      return [{ url: this.initialUrl, html: "" }];
     }
 
     // make sure to run include exclude here again
-    return this.filterLinks(urls, limit, this.maxCrawledDepth);
+    const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
+    return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
 
   private async crawlUrls(
     urls: string[],
     concurrencyLimit: number,
     inProgress?: (progress: Progress) => void
-  ): Promise<string[]> {
+  ): Promise<{ url: string, html: string }[]> {
     const queue = async.queue(async (task: string, callback) => {
       if (this.crawledUrls.size >= this.maxCrawledLinks) {
         if (callback && typeof callback === "function") {
@@ -143,13 +143,13 @@
         return;
       }
       const newUrls = await this.crawl(task);
-      newUrls.forEach((url) => this.crawledUrls.add(url));
+      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
       if (inProgress && newUrls.length > 0) {
         inProgress({
           current: this.crawledUrls.size,
           total: this.maxCrawledLinks,
           status: "SCRAPING",
-          currentDocumentUrl: newUrls[newUrls.length - 1],
+          currentDocumentUrl: newUrls[newUrls.length - 1].url,
         });
       } else if (inProgress) {
         inProgress({
@@ -159,7 +159,7 @@
           currentDocumentUrl: task,
         });
       }
-      await this.crawlUrls(newUrls, concurrencyLimit, inProgress);
+      await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
@@ -175,10 +175,10 @@
       }
     );
     await queue.drain();
-    return Array.from(this.crawledUrls);
+    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
 
-  async crawl(url: string): Promise<string[]> {
+  async crawl(url: string): Promise<{url: string, html: string}[]> {
     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
       return [];
     this.visited.add(url);
@@ -193,16 +193,17 @@
     }
 
     try {
-      let content;
-      // If it is the first link, fetch with scrapingbee
+      let content : string = "";
+      // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
-        content = await scrapWithScrapingBee(url, "load");
+        const page = await scrapSingleUrl(url, {includeHtml: true});
+        content = page.html ?? ""
       } else {
         const response = await axios.get(url);
-        content = response.data;
+        content = response.data ?? "";
       }
       const $ = load(content);
-      let links: string[] = [];
+      let links: {url: string, html: string}[] = [];
 
       $("a").each((_, element) => {
         const href = $(element).attr("href");
@@ -215,7 +216,6 @@
           const path = url.pathname;
 
           if (
-            // fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
             this.isInternalLink(fullUrl) &&
             this.matchesPattern(fullUrl) &&
             this.noSections(fullUrl) &&
@@ -223,12 +223,13 @@
             !this.matchesExcludes(path) &&
             this.robots.isAllowed(fullUrl, "FireCrawlAgent")
           ) {
-            links.push(fullUrl);
+            links.push({url: fullUrl, html: content});
           }
         }
       });
 
-      return links.filter((link) => !this.visited.has(link));
+      // Create a new list to return to avoid modifying the visited list
+      return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
       return [];
     }
@@ -35,6 +35,7 @@ export class WebScraperDataProvider {
   private replaceAllPathsWithAbsolutePaths?: boolean = false;
   private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
     "gpt-4-turbo";
+  private crawlerMode: string = "default";
 
   authorize(): void {
     throw new Error("Method not implemented.");
@@ -46,7 +47,8 @@
 
   private async convertUrlsToDocuments(
     urls: string[],
-    inProgress?: (progress: Progress) => void
+    inProgress?: (progress: Progress) => void,
+    allHtmls?: string[]
   ): Promise<Document[]> {
     const totalUrls = urls.length;
     let processedUrls = 0;
@@ -56,7 +58,8 @@
       const batchUrls = urls.slice(i, i + this.concurrentRequests);
       await Promise.all(
         batchUrls.map(async (url, index) => {
-          const result = await scrapSingleUrl(url, this.pageOptions);
+          const existingHTML = allHtmls ? allHtmls[i + index] : "";
+          const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
          processedUrls++;
           if (inProgress) {
             inProgress({
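The allHtmls parameter relies on being index-aligned with urls: allHtmls[i + index] is assumed to hold the HTML captured for the URL at the same position. A standalone sketch of that convention (illustrative, not the project's code):

// Both arrays are derived from the same crawl result, so position i in one
// corresponds to position i in the other; breaking that alignment would pair
// a URL with another page's HTML.
const links = [
  { url: "https://example.com/", html: "<html>home</html>" },
  { url: "https://example.com/docs", html: "" }, // empty html falls back to a live fetch
];
const urls = links.map((l) => l.url);
const allHtmls = links.map((l) => l.html);

urls.forEach((url, i) => {
  const existingHTML = allHtmls[i];
  console.log(url, existingHTML ? "reuse crawler HTML" : "scrape again");
});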
@@ -139,13 +142,26 @@
       limit: this.limit,
       generateImgAltText: this.generateImgAltText,
     });
 
     let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
+
+    const allLinks = links.map((e) => e.url);
+    const allHtmls = links.map((e)=> e.html);
+
     if (this.returnOnlyUrls) {
-      return this.returnOnlyUrlsResponse(links, inProgress);
+      return this.returnOnlyUrlsResponse(allLinks , inProgress);
     }
 
-    let documents = await this.processLinks(links, inProgress);
-    return this.cacheAndFinalizeDocuments(documents, links);
+    let documents = [];
+    // check if fast mode is enabled and there is html inside the links
+    if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
+      console.log("Fast mode enabled");
+      documents = await this.processLinks(allLinks, inProgress, allHtmls);
+    }else{
+      documents = await this.processLinks(allLinks, inProgress);
+    }
+
+    return this.cacheAndFinalizeDocuments(documents, allLinks);
   }
 
   private async handleSingleUrlsMode(
@@ -187,14 +203,17 @@
 
   private async processLinks(
     links: string[],
-    inProgress?: (progress: Progress) => void
+    inProgress?: (progress: Progress) => void,
+    allHtmls?: string[]
   ): Promise<Document[]> {
     let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
     let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
     links = links.filter((link) => !link.endsWith(".pdf"));
 
-    let documents = await this.convertUrlsToDocuments(links, inProgress);
+    let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls);
     documents = await this.getSitemapData(this.urls[0], documents);
 
+
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);
@@ -397,6 +416,7 @@
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check
     this.excludes = this.excludes.filter((item) => item !== "");
+    this.crawlerMode = options.crawlerOptions?.mode ?? "default";
 
     // make sure all urls start with https://
     this.urls = this.urls.map((url) => {
@@ -106,7 +106,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
 
 export async function scrapSingleUrl(
   urlToScrap: string,
-  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false }
+  pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
+  existingHtml: string = ""
 ): Promise<Document> {
   urlToScrap = urlToScrap.trim();
 
@@ -197,8 +198,15 @@
     : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
 
   for (const scraper of scrapersInOrder) {
+    // If exists text coming from crawler, use it
+    if (existingHtml && existingHtml.trim().length >= 100) {
+      let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
+      text = await parseMarkdown(cleanedHtml);
+      html = existingHtml;
+      break;
+    }
     [text, html] = await attemptScraping(urlToScrap, scraper);
-    if (text && text.length >= 100) break;
+    if (text && text.trim().length >= 100) break;
     console.log(`Falling back to ${scraper}`);
   }
 
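The reuse path added above only trusts crawler-supplied HTML when it has some substance; anything shorter than 100 characters after trimming falls through to the normal scraper cascade. A standalone sketch of that guard (illustrative, not the project's code):

// Mirrors the condition added above: reuse existing HTML only when, after
// trimming, it is at least 100 characters long; otherwise scrape the URL again.
function shouldReuseExistingHtml(existingHtml: string): boolean {
  return existingHtml.trim().length >= 100;
}

console.log(shouldReuseExistingHtml(""));                               // false -> run the scrapers
console.log(shouldReuseExistingHtml("   <html></html>   "));            // false -> too short to trust
console.log(shouldReuseExistingHtml("<p>real content</p>".repeat(10))); // true  -> parse to markdown directly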
@@ -26,7 +26,7 @@ getWebScraperQueue().process(
       success: success,
       result: {
         links: docs.map((doc) => {
-          return { content: doc, source: doc.metadata.sourceURL };
+          return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
         }),
       },
       project_id: job.data.project_id,
@@ -176,7 +176,7 @@ describe("Scraping Checkup (E2E)", () => {
       }
 
 
-      expect(score).toBeGreaterThanOrEqual(75);
+      expect(score).toBeGreaterThanOrEqual(70);
     }, 350000); // 150 seconds timeout
   });
 });