Merge branch 'main' into test/crawl-options

This commit is contained in:
Nicolas 2024-05-15 12:34:47 -07:00
commit 1601e93d69
8 changed files with 283 additions and 35 deletions

View File

@ -18,8 +18,8 @@
"paths": { "paths": {
"/scrape": { "/scrape": {
"post": { "post": {
"summary": "Scrape a single URL", "summary": "Scrape a single URL and optionally extract information using an LLM",
"operationId": "scrapeSingleUrl", "operationId": "scrapeAndExtractFromUrl",
"tags": ["Scraping"], "tags": ["Scraping"],
"security": [ "security": [
{ {
@ -45,8 +45,43 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
},
"extractorOptions": {
"type": "object",
"description": "Options for LLM-based extraction of structured information from the page content",
"properties": {
"mode": {
"type": "string",
"enum": ["llm-extraction"],
"description": "The extraction mode to use, currently supports 'llm-extraction'"
},
"extractionPrompt": {
"type": "string",
"description": "A prompt describing what information to extract from the page"
},
"extractionSchema": {
"type": "object",
"additionalProperties": true,
"description": "The schema for the data to be extracted",
"required": [
"company_mission",
"supports_sso",
"is_open_source"
]
}
}
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
} }
}, },
"required": ["url"] "required": ["url"]
@ -126,6 +161,16 @@
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.", "description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false "default": false
}, },
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
"default": "default"
},
"limit": { "limit": {
"type": "integer", "type": "integer",
"description": "Maximum number of pages to crawl", "description": "Maximum number of pages to crawl",
@ -140,6 +185,11 @@
"type": "boolean", "type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.", "description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false "default": false
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
} }
@ -206,6 +256,11 @@
"type": "boolean", "type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.", "description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true "default": true
},
"includeHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a html key in the response.",
"default": false
} }
} }
}, },
@ -302,6 +357,63 @@
"$ref": "#/components/schemas/ScrapeResponse" "$ref": "#/components/schemas/ScrapeResponse"
}, },
"description": "Data returned from the job (null when it is in progress)" "description": "Data returned from the job (null when it is in progress)"
},
"partial_data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ScrapeResponse"
},
"description": "Partial documents returned as it is being crawls (streaming). When a page is ready it will append to the parial_data array - so no need to wait for all the website to be crawled."
}
}
}
}
}
},
"402": {
"description": "Payment required"
},
"429": {
"description": "Too many requests"
},
"500": {
"description": "Server error"
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
} }
} }
} }
@ -344,6 +456,11 @@
"content": { "content": {
"type": "string" "type": "string"
}, },
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": { "metadata": {
"type": "object", "type": "object",
"properties": { "properties": {

View File

@ -660,6 +660,107 @@ describe("E2E Tests for API Routes", () => {
// }, 120000); // 120 secs // }, 120000); // 120 secs
// }); // });
describe("POST /v0/crawl with fast mode", () => {
it("should complete the crawl under 20 seconds", async () => {
const startTime = Date.now();
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://flutterbricks.com",
crawlerOptions: {
mode: "fast"
}
});
expect(crawlResponse.statusCode).toBe(200);
const jobId = crawlResponse.body.jobId;
let statusResponse;
let isFinished = false;
while (!isFinished) {
statusResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(statusResponse.statusCode).toBe(200);
isFinished = statusResponse.body.status === "completed";
if (!isFinished) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const endTime = Date.now();
const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
console.log(`Time elapsed: ${timeElapsed} seconds`);
expect(statusResponse.body.status).toBe("completed");
expect(statusResponse.body).toHaveProperty("data");
expect(statusResponse.body.data[0]).toHaveProperty("content");
expect(statusResponse.body.data[0]).toHaveProperty("markdown");
const results = statusResponse.body.data;
// results.forEach((result, i) => {
// console.log(result.metadata.sourceURL);
// });
expect(results.length).toBeGreaterThanOrEqual(10);
expect(results.length).toBeLessThanOrEqual(15);
}, 20000);
// it("should complete the crawl in more than 10 seconds", async () => {
// const startTime = Date.now();
// const crawlResponse = await request(TEST_URL)
// .post("/v0/crawl")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send({
// url: "https://flutterbricks.com",
// });
// expect(crawlResponse.statusCode).toBe(200);
// const jobId = crawlResponse.body.jobId;
// let statusResponse;
// let isFinished = false;
// while (!isFinished) {
// statusResponse = await request(TEST_URL)
// .get(`/v0/crawl/status/${jobId}`)
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
// expect(statusResponse.statusCode).toBe(200);
// isFinished = statusResponse.body.status === "completed";
// if (!isFinished) {
// await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
// }
// }
// const endTime = Date.now();
// const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds
// console.log(`Time elapsed: ${timeElapsed} seconds`);
// expect(statusResponse.body.status).toBe("completed");
// expect(statusResponse.body).toHaveProperty("data");
// expect(statusResponse.body.data[0]).toHaveProperty("content");
// expect(statusResponse.body.data[0]).toHaveProperty("markdown");
// const results = statusResponse.body.data;
// // results.forEach((result, i) => {
// // console.log(result.metadata.sourceURL);
// // });
// expect(results.length).toBeGreaterThanOrEqual(10);
// expect(results.length).toBeLessThanOrEqual(15);
// }, 50000); // 50 seconds timeout to account for network delays
});
describe("GET /is-production", () => { describe("GET /is-production", () => {
it("should return the production status", async () => { it("should return the production status", async () => {
const response = await request(TEST_URL).get("/is-production"); const response = await request(TEST_URL).get("/is-production");

View File

@ -44,6 +44,7 @@ export type WebScraperOptions = {
limit?: number; limit?: number;
generateImgAltText?: boolean; generateImgAltText?: boolean;
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
}; };
pageOptions?: PageOptions; pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions; extractorOptions?: ExtractorOptions;

View File

@ -4,7 +4,7 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap"; import { getLinksFromSitemap } from "./sitemap";
import async from "async"; import async from "async";
import { Progress } from "../../lib/entities"; import { Progress } from "../../lib/entities";
import { scrapWithScrapingBee } from "./single_url"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
export class WebCrawler { export class WebCrawler {
@ -15,7 +15,7 @@ export class WebCrawler {
private maxCrawledLinks: number; private maxCrawledLinks: number;
private maxCrawledDepth: number; private maxCrawledDepth: number;
private visited: Set<string> = new Set(); private visited: Set<string> = new Set();
private crawledUrls: Set<string> = new Set(); private crawledUrls: Map<string, string> = new Map();
private limit: number; private limit: number;
private robotsTxtUrl: string; private robotsTxtUrl: string;
private robots: any; private robots: any;
@ -51,7 +51,6 @@ export class WebCrawler {
this.generateImgAltText = generateImgAltText ?? false; this.generateImgAltText = generateImgAltText ?? false;
} }
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
return sitemapLinks return sitemapLinks
.filter((link) => { .filter((link) => {
@ -99,7 +98,7 @@ export class WebCrawler {
concurrencyLimit: number = 5, concurrencyLimit: number = 5,
limit: number = 10000, limit: number = 10000,
maxDepth: number = 10 maxDepth: number = 10
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
// Fetch and parse robots.txt // Fetch and parse robots.txt
try { try {
const response = await axios.get(this.robotsTxtUrl); const response = await axios.get(this.robotsTxtUrl);
@ -111,7 +110,7 @@ export class WebCrawler {
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks; return filteredLinks.map(link => ({ url: link, html: "" }));
} }
const urls = await this.crawlUrls( const urls = await this.crawlUrls(
@ -123,18 +122,19 @@ export class WebCrawler {
urls.length === 0 && urls.length === 0 &&
this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
) { ) {
return [this.initialUrl]; return [{ url: this.initialUrl, html: "" }];
} }
// make sure to run include exclude here again // make sure to run include exclude here again
return this.filterLinks(urls, limit, this.maxCrawledDepth); const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
} }
private async crawlUrls( private async crawlUrls(
urls: string[], urls: string[],
concurrencyLimit: number, concurrencyLimit: number,
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<string[]> { ): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => { const queue = async.queue(async (task: string, callback) => {
if (this.crawledUrls.size >= this.maxCrawledLinks) { if (this.crawledUrls.size >= this.maxCrawledLinks) {
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
@ -143,13 +143,13 @@ export class WebCrawler {
return; return;
} }
const newUrls = await this.crawl(task); const newUrls = await this.crawl(task);
newUrls.forEach((url) => this.crawledUrls.add(url)); newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
if (inProgress && newUrls.length > 0) { if (inProgress && newUrls.length > 0) {
inProgress({ inProgress({
current: this.crawledUrls.size, current: this.crawledUrls.size,
total: this.maxCrawledLinks, total: this.maxCrawledLinks,
status: "SCRAPING", status: "SCRAPING",
currentDocumentUrl: newUrls[newUrls.length - 1], currentDocumentUrl: newUrls[newUrls.length - 1].url,
}); });
} else if (inProgress) { } else if (inProgress) {
inProgress({ inProgress({
@ -159,7 +159,7 @@ export class WebCrawler {
currentDocumentUrl: task, currentDocumentUrl: task,
}); });
} }
await this.crawlUrls(newUrls, concurrencyLimit, inProgress); await this.crawlUrls(newUrls.map((p) => p.url), concurrencyLimit, inProgress);
if (callback && typeof callback === "function") { if (callback && typeof callback === "function") {
callback(); callback();
} }
@ -175,10 +175,10 @@ export class WebCrawler {
} }
); );
await queue.drain(); await queue.drain();
return Array.from(this.crawledUrls); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
} }
async crawl(url: string): Promise<string[]> { async crawl(url: string): Promise<{url: string, html: string}[]> {
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent"))
return []; return [];
this.visited.add(url); this.visited.add(url);
@ -193,16 +193,17 @@ export class WebCrawler {
} }
try { try {
let content; let content : string = "";
// If it is the first link, fetch with scrapingbee // If it is the first link, fetch with single url
if (this.visited.size === 1) { if (this.visited.size === 1) {
content = await scrapWithScrapingBee(url, "load"); const page = await scrapSingleUrl(url, {includeHtml: true});
content = page.html ?? ""
} else { } else {
const response = await axios.get(url); const response = await axios.get(url);
content = response.data; content = response.data ?? "";
} }
const $ = load(content); const $ = load(content);
let links: string[] = []; let links: {url: string, html: string}[] = [];
$("a").each((_, element) => { $("a").each((_, element) => {
const href = $(element).attr("href"); const href = $(element).attr("href");
@ -215,7 +216,6 @@ export class WebCrawler {
const path = url.pathname; const path = url.pathname;
if ( if (
// fullUrl.startsWith(this.initialUrl) && // this condition makes it stop crawling back the url
this.isInternalLink(fullUrl) && this.isInternalLink(fullUrl) &&
this.matchesPattern(fullUrl) && this.matchesPattern(fullUrl) &&
this.noSections(fullUrl) && this.noSections(fullUrl) &&
@ -223,12 +223,13 @@ export class WebCrawler {
!this.matchesExcludes(path) && !this.matchesExcludes(path) &&
this.robots.isAllowed(fullUrl, "FireCrawlAgent") this.robots.isAllowed(fullUrl, "FireCrawlAgent")
) { ) {
links.push(fullUrl); links.push({url: fullUrl, html: content});
} }
} }
}); });
return links.filter((link) => !this.visited.has(link)); // Create a new list to return to avoid modifying the visited list
return links.filter((link) => !this.visited.has(link.url));
} catch (error) { } catch (error) {
return []; return [];
} }

View File

@ -35,6 +35,7 @@ export class WebScraperDataProvider {
private replaceAllPathsWithAbsolutePaths?: boolean = false; private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo"; "gpt-4-turbo";
private crawlerMode: string = "default";
authorize(): void { authorize(): void {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
@ -46,7 +47,8 @@ export class WebScraperDataProvider {
private async convertUrlsToDocuments( private async convertUrlsToDocuments(
urls: string[], urls: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
const totalUrls = urls.length; const totalUrls = urls.length;
let processedUrls = 0; let processedUrls = 0;
@ -56,7 +58,8 @@ export class WebScraperDataProvider {
const batchUrls = urls.slice(i, i + this.concurrentRequests); const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all( await Promise.all(
batchUrls.map(async (url, index) => { batchUrls.map(async (url, index) => {
const result = await scrapSingleUrl(url, this.pageOptions); const existingHTML = allHtmls ? allHtmls[i + index] : "";
const result = await scrapSingleUrl(url, this.pageOptions, existingHTML);
processedUrls++; processedUrls++;
if (inProgress) { if (inProgress) {
inProgress({ inProgress({
@ -139,13 +142,26 @@ export class WebScraperDataProvider {
limit: this.limit, limit: this.limit,
generateImgAltText: this.generateImgAltText, generateImgAltText: this.generateImgAltText,
}); });
let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth); let links = await crawler.start(inProgress, 5, this.limit, this.maxCrawledDepth);
const allLinks = links.map((e) => e.url);
const allHtmls = links.map((e)=> e.html);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {
return this.returnOnlyUrlsResponse(links, inProgress); return this.returnOnlyUrlsResponse(allLinks , inProgress);
}
let documents = [];
// check if fast mode is enabled and there is html inside the links
if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
console.log("Fast mode enabled");
documents = await this.processLinks(allLinks, inProgress, allHtmls);
}else{
documents = await this.processLinks(allLinks, inProgress);
} }
let documents = await this.processLinks(links, inProgress); return this.cacheAndFinalizeDocuments(documents, allLinks);
return this.cacheAndFinalizeDocuments(documents, links);
} }
private async handleSingleUrlsMode( private async handleSingleUrlsMode(
@ -187,14 +203,17 @@ export class WebScraperDataProvider {
private async processLinks( private async processLinks(
links: string[], links: string[],
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void,
allHtmls?: string[]
): Promise<Document[]> { ): Promise<Document[]> {
let pdfLinks = links.filter((link) => link.endsWith(".pdf")); let pdfLinks = links.filter((link) => link.endsWith(".pdf"));
let pdfDocuments = await this.fetchPdfDocuments(pdfLinks); let pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
links = links.filter((link) => !link.endsWith(".pdf")); links = links.filter((link) => !link.endsWith(".pdf"));
let documents = await this.convertUrlsToDocuments(links, inProgress); let documents = await this.convertUrlsToDocuments(links, inProgress, allHtmls);
documents = await this.getSitemapData(this.urls[0], documents); documents = await this.getSitemapData(this.urls[0], documents);
documents = this.applyPathReplacements(documents); documents = this.applyPathReplacements(documents);
// documents = await this.applyImgAltText(documents); // documents = await this.applyImgAltText(documents);
@ -397,6 +416,7 @@ export class WebScraperDataProvider {
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false; this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
this.excludes = this.excludes.filter((item) => item !== ""); this.excludes = this.excludes.filter((item) => item !== "");
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
// make sure all urls start with https:// // make sure all urls start with https://
this.urls = this.urls.map((url) => { this.urls = this.urls.map((url) => {

View File

@ -106,7 +106,8 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
export async function scrapSingleUrl( export async function scrapSingleUrl(
urlToScrap: string, urlToScrap: string,
pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false } pageOptions: PageOptions = { onlyMainContent: true, includeHtml: false },
existingHtml: string = ""
): Promise<Document> { ): Promise<Document> {
urlToScrap = urlToScrap.trim(); urlToScrap = urlToScrap.trim();
@ -197,8 +198,15 @@ export async function scrapSingleUrl(
: ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"]; : ["scrapingBee", "playwright", "scrapingBeeLoad", "fetch"];
for (const scraper of scrapersInOrder) { for (const scraper of scrapersInOrder) {
// If exists text coming from crawler, use it
if (existingHtml && existingHtml.trim().length >= 100) {
let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
text = await parseMarkdown(cleanedHtml);
html = existingHtml;
break;
}
[text, html] = await attemptScraping(urlToScrap, scraper); [text, html] = await attemptScraping(urlToScrap, scraper);
if (text && text.length >= 100) break; if (text && text.trim().length >= 100) break;
console.log(`Falling back to ${scraper}`); console.log(`Falling back to ${scraper}`);
} }

View File

@ -26,7 +26,7 @@ getWebScraperQueue().process(
success: success, success: success,
result: { result: {
links: docs.map((doc) => { links: docs.map((doc) => {
return { content: doc, source: doc.metadata.sourceURL }; return { content: doc, source: doc?.metadata?.sourceURL ?? doc?.url ?? "" };
}), }),
}, },
project_id: job.data.project_id, project_id: job.data.project_id,

View File

@ -176,7 +176,7 @@ describe("Scraping Checkup (E2E)", () => {
} }
expect(score).toBeGreaterThanOrEqual(75); expect(score).toBeGreaterThanOrEqual(70);
}, 350000); // 350 seconds timeout to account for network delays }, 350000); // 350 seconds timeout to account for network delays
}); });
}); });