Merge branch 'main' into nsc/pay-as-you-go-lw2

This commit is contained in:
Nicolas 2024-10-24 22:31:04 -03:00
commit 29b34270c8
28 changed files with 784 additions and 58 deletions

.gitignore
View File

@ -29,3 +29,4 @@ apps/js-sdk/firecrawl/dist
/examples/o1_web_crawler/firecrawl_env
/examples/crm_lead_enrichment/crm_lead_enrichment_env
/.venv
/examples/claude_web_crawler/firecrawl_env

View File

@ -13,7 +13,7 @@ import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger";
import { redlock } from "../services/redlock";
import { getValue } from "../services/redis";
import { deleteKey, getValue } from "../services/redis";
import { setValue } from "../services/redis";
import { validate } from "uuid";
import * as Sentry from "@sentry/node";
@ -134,6 +134,13 @@ export async function getACUC(
}
}
export async function clearACUC(
api_key: string,
): Promise<void> {
const cacheKeyACUC = `acuc_${api_key}`;
await deleteKey(cacheKeyACUC);
}
export async function authenticateUser(
req,
res,

View File

@ -0,0 +1,22 @@
import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth";
import { Logger } from "../../../lib/logger";
export async function acucCacheClearController(req: Request, res: Response) {
try {
const team_id: string = req.body.team_id;
const keys = await supabase_service
.from("api_keys")
.select("*")
.eq("team_id", team_id);
await Promise.all(keys.data.map((x) => clearACUC(x.key)));
res.json({ ok: true });
} catch (error) {
Logger.error(`Error clearing ACUC cache via API route: ${error}`);
res.status(500).json({ error: "Internal server error" });
}
}
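For reference, a minimal sketch (not part of this diff) of how the new controller could be invoked once it is registered on the admin router later in this change; the base URL and team ID are placeholders, and the path segment comes from `BULL_AUTH_KEY` exactly as in the `adminRouter.post` registration below.
```js
// Hypothetical call to the admin cache-clear route added in this commit.
// Assumptions: host, team_id value, and that the admin router parses JSON bodies.
const res = await fetch(
  `https://api.example.com/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
  {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ team_id: "team_123" }),
  }
);
console.log(await res.json()); // { ok: true } on success
```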

View File

@ -0,0 +1,99 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJobs,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>
) {
req.body = batchScrapeRequestSchema.parse(req.body);
const id = uuidv4();
await logCrawl(id, req.auth.team_id);
let { remainingCredits } = req.account;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
remainingCredits = Infinity;
}
const pageOptions = legacyScrapeOptions(req.body);
const sc: StoredCrawl = {
crawlerOptions: null,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
};
await saveCrawl(id, sc);
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(req.body.urls.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
}
const jobs = req.body.urls.map((x) => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url: x,
mode: "single_urls",
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions: null,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});
await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
});
}
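This controller only enqueues the jobs and returns an ID; the scraped documents are fetched later from the status route. A hedged sketch of calling the endpoint directly over HTTP (the host and Bearer auth header are assumptions; the request body follows `batchScrapeRequestSchema` and the response shape matches the `res.status(200).json(...)` above):
```js
const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR_API_KEY", // auth scheme assumed
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev", "https://mendable.ai"],
    formats: ["markdown", "html"],
  }),
});
const { success, id, url } = await res.json();
// `url` points at /v1/batch/scrape/{id}, served by the status route below.
```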

View File

@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
return jobs;
}
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const data = doneJobs.map(x => x.returnvalue);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
nextURL.searchParams.set("skip", (start + data.length).toString());

View File

@ -78,7 +78,7 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
} catch (e) {
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(

View File

@ -117,6 +117,7 @@ export const scrapeOptions = z.object({
}
).transform(val => val ? val.toUpperCase() : 'US')
}).optional(),
skipTlsVerification: z.boolean().default(false),
}).strict(strictMessage)
@ -141,19 +142,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
return obj;
});
// export type ScrapeRequest = {
// url: string;
// formats?: Format[];
// headers?: { [K: string]: string };
// includeTags?: string[];
// excludeTags?: string[];
// onlyMainContent?: boolean;
// timeout?: number;
// waitFor?: number;
// }
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
}).strict(strictMessage).refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
},
{
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
}
).transform((obj) => {
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
return { ...obj, timeout: 60000 };
}
return obj;
});
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
excludePaths: z.string().array().default([]),
@ -433,6 +444,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.geolocation,
skipTlsVerification: x.skipTlsVerification
};
}
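To make the new batch schema's behavior concrete, here is a small sketch (not part of the diff) exercising `batchScrapeRequestSchema` as defined earlier in this file; the relative import path and the exact shape of the `extract` options are assumptions. The trailing transform also applies a 60 000 ms timeout to extract requests that arrive without one, mirroring `scrapeRequestSchema`.
```js
import { batchScrapeRequestSchema } from "./types"; // assumed relative path

// Rejected by the refine: the 'extract' format requires matching 'extract' options, and vice versa.
const missingOptions = batchScrapeRequestSchema.safeParse({
  urls: ["https://firecrawl.dev"],
  formats: ["extract"],
});
console.log(missingOptions.success); // false

// Accepted: markdown/html formats with no extract options pass straight through.
const ok = batchScrapeRequestSchema.safeParse({
  urls: ["https://firecrawl.dev", "https://mendable.ai"],
  formats: ["markdown", "html"],
});
console.log(ok.success); // true
```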

View File

@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
import { Logger } from "./logger";
export type StoredCrawl = {
originUrl: string;
originUrl?: string;
crawlerOptions: any;
pageOptions: any;
team_id: string;

View File

@ -54,6 +54,7 @@ export type PageOptions = {
geolocation?: {
country?: string;
};
skipTlsVerification?: boolean;
};
export type ExtractorOptions = {

View File

@ -112,7 +112,7 @@ export async function runWebScraper({
}
// remove docs with empty content
const filteredDocs = crawlerOptions.returnOnlyUrls
const filteredDocs = crawlerOptions?.returnOnlyUrls
? docs.map((doc) => {
if (doc.metadata.sourceURL) {
return { url: doc.metadata.sourceURL };

View File

@ -6,6 +6,8 @@ import {
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/v0/admin/queue";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { wrap } from "./v1";
export const adminRouter = express.Router();
@ -33,3 +35,8 @@ adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
autoscalerController
);
adminRouter.post(
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
wrap(acucCacheClearController),
);

View File

@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@ -29,7 +30,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
return (req, res, next) => {
(async () => {
if (!minimum && req.body) {
minimum = (req.body as any)?.limit ?? 1;
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
}
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
req.acuc = chunk;
@ -94,7 +95,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
next();
}
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
export function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
return (req, res, next) => {
controller(req, res)
.catch(err => next(err))
@ -122,6 +123,15 @@ v1Router.post(
wrap(crawlController)
);
v1Router.post(
"/batch/scrape",
authMiddleware(RateLimiterMode.Crawl),
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(batchScrapeController)
);
v1Router.post(
"/map",
authMiddleware(RateLimiterMode.Map),
@ -136,6 +146,13 @@ v1Router.get(
wrap(crawlStatusController)
);
v1Router.get(
"/batch/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
// Yes, it uses the same controller as the normal crawl status controller
wrap((req:any, res):any => crawlStatusController(req, res, true))
);
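Because this route reuses `crawlStatusController`, pagination behaves exactly like crawl status: when more results are available, the response carries a `next` URL with a `skip` parameter pointing back at `/v1/batch/scrape/{id}`. A hedged sketch of draining all pages by hand (host and auth header are assumptions):
```js
async function fetchAllBatchResults(id, apiKey) {
  const headers = { Authorization: `Bearer ${apiKey}` }; // auth scheme assumed
  let url = `https://api.firecrawl.dev/v1/batch/scrape/${id}`;
  const docs = [];
  while (url) {
    const page = await (await fetch(url, { headers })).json();
    docs.push(...(page.data ?? []));
    url = page.next ?? null; // `next` is only present while more results remain
  }
  return docs;
}
```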
v1Router.get(
"/scrape/:jobId",
wrap(scrapeStatusController)

View File

@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger";
import https from "https";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
@ -145,8 +145,14 @@ export class WebCrawler {
.slice(0, limit);
}
public async getRobotsTxt(): Promise<string> {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
let extraArgs = {};
if(skipTlsVerification) {
extraArgs["httpsAgent"] = new https.Agent({
rejectUnauthorized: false
});
}
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
return response.data;
}

View File

@ -594,6 +594,7 @@ export class WebScraperDataProvider {
atsv: options.pageOptions?.atsv ?? false,
actions: options.pageOptions?.actions ?? undefined,
geolocation: options.pageOptions?.geolocation ?? undefined,
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =

View File

@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
fireEngineOptions = {},
headers,
options,
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
@ -119,6 +119,7 @@ export async function scrapWithFireEngine({
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
geolocation: pageOptions?.geolocation,
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
actions: actions,
},
{

View File

@ -157,6 +157,7 @@ export async function scrapSingleUrl(
atsv: pageOptions.atsv ?? false,
actions: pageOptions.actions ?? undefined,
geolocation: pageOptions.geolocation ?? undefined,
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
}
if (extractorOptions) {

View File

@ -329,7 +329,8 @@ async function processJob(job: Job, token: string) {
job.id as string,
data,
job.data.webhook,
job.data.v1
job.data.v1,
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
);
}
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
@ -339,7 +340,7 @@ async function processJob(job: Job, token: string) {
data,
job.data.webhook,
job.data.v1,
"crawl.page",
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
true
);
}
@ -365,7 +366,7 @@ async function processJob(job: Job, token: string) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (!job.data.sitemapped) {
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
@ -415,8 +416,6 @@ async function processJob(job: Job, token: string) {
}
if (await finishCrawl(job.data.crawl_id)) {
if (!job.data.v1) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
@ -439,7 +438,7 @@ async function processJob(job: Job, token: string) {
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
@ -469,7 +468,7 @@ async function processJob(job: Job, token: string) {
data,
job.data.webhook,
job.data.v1,
"crawl.completed"
job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
);
}
} else {
@ -487,7 +486,7 @@ async function processJob(job: Job, token: string) {
[],
job.data.webhook,
job.data.v1,
"crawl.completed"
job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
);
}
@ -499,7 +498,7 @@ async function processJob(job: Job, token: string) {
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
@ -556,7 +555,8 @@ async function processJob(job: Job, token: string) {
job.data.crawl_id ?? (job.id as string),
data,
job.data.webhook,
job.data.v1
job.data.v1,
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
);
}
// if (job.data.v1) {
@ -605,7 +605,7 @@ async function processJob(job: Job, token: string) {
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: "crawl",
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
url: sc ? sc.originUrl : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,

View File

@ -161,4 +161,4 @@ export type PlanType =
| "";
export type WebhookEventType = "crawl.page" | "crawl.started" | "crawl.completed" | "crawl.failed";
export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";

View File

@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
});
```
### Batch scraping multiple URLs
To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
```js
const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
formats: ['markdown', 'html'],
})
```
#### Asynchronous batch scrape
To initiate an asynchronous batch scrape, use the `asyncBatchScrapeUrls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats. On success, it returns an ID, which you can use to check the status of the batch scrape.
```js
const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
```
#### Batch scrape with WebSockets
To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
```js
// Batch scrape multiple URLs with WebSockets:
const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
watch.addEventListener("document", doc => {
console.log("DOC", doc.detail);
});
watch.addEventListener("error", err => {
console.error("ERR", err.detail.error);
});
watch.addEventListener("done", state => {
console.log("DONE", state.detail.status);
});
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
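A minimal sketch of that pattern applied to the new batch scrape method (assuming a `FirecrawlError` is thrown on failure, as in the SDK code elsewhere in this change):
```js
try {
  const result = await app.batchScrapeUrls(['https://firecrawl.dev'], { formats: ['markdown'] });
  console.log(result.status, result.completed, result.total);
} catch (error) {
  console.error(`Batch scrape failed: ${error.message}`);
}
```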

View File

@ -1,6 +1,6 @@
{
"name": "firecrawl",
"version": "1.6.1",
"name": "@mendable/firecrawl-js",
"version": "1.7.1",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -154,6 +154,17 @@ export interface CrawlResponse {
error?: string;
}
/**
* Response interface for batch scrape operations.
* Defines the structure of the response received after initiating a batch scrape.
*/
export interface BatchScrapeResponse {
id?: string;
url?: string;
success: true;
error?: string;
}
/**
* Response interface for job status checks.
* Provides detailed status of a crawl job including progress and results.
@ -169,6 +180,21 @@ export interface CrawlStatusResponse {
data: FirecrawlDocument<undefined>[];
};
/**
* Response interface for batch scrape job status checks.
* Provides detailed status of a batch scrape job including progress and results.
*/
export interface BatchScrapeStatusResponse {
success: true;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
creditsUsed: number;
expiresAt: Date;
next?: string;
data: FirecrawlDocument<undefined>[];
};
/**
* Parameters for mapping operations.
* Defines options for mapping URLs during a crawl.
@ -493,6 +519,144 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Initiates a batch scrape job for multiple URLs using the Firecrawl API.
* @param urls - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the batch scrape operation.
*/
async batchScrapeUrls(
urls: string[],
params?: ScrapeParams,
pollInterval: number = 2,
idempotencyKey?: string
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
if (response.status === 200) {
const id: string = response.data.id;
return this.monitorJobStatus(id, headers, pollInterval);
} else {
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
async asyncBatchScrapeUrls(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string
): Promise<BatchScrapeResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
/**
* Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
* @param urls - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns A CrawlWatcher instance to monitor the batch scrape job.
*/
async batchScrapeUrlsAndWatch(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string,
) {
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
if (crawl.success && crawl.id) {
const id = crawl.id;
return new CrawlWatcher(id, this);
}
throw new FirecrawlError("Batch scrape job failed to start", 400);
}
/**
* Checks the status of a batch scrape job using the Firecrawl API.
* @param id - The ID of the batch scrape operation.
* @param getAllData - Whether to paginate through all pages of documents and return the full list. (default: `false`)
* @returns The response containing the job status.
*/
async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<BatchScrapeStatusResponse | ErrorResponse> {
if (!id) {
throw new FirecrawlError("No batch scrape ID provided", 400);
}
const headers: AxiosRequestHeaders = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/batch/scrape/${id}`,
headers
);
if (response.status === 200) {
let allData = response.data.data;
if (getAllData && response.data.status === "completed") {
let statusData = response.data
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: allData,
error: response.data.error,
})
} else {
this.handleError(response, "check batch scrape status");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.

View File

@ -9,7 +9,7 @@
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^1.0.3",
"@mendable/firecrawl-js": "^1.7.0-beta.2",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",
@ -423,31 +423,17 @@
}
},
"node_modules/@mendable/firecrawl-js": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
"integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
"version": "1.7.0-beta.2",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz",
"integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
}
},
"node_modules/@mendable/firecrawl-js/node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/@tsconfig/node10": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",

View File

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^1.0.3",
"@mendable/firecrawl-js": "1.7.1",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",

View File

@ -149,6 +149,69 @@ async def start_crawl_and_watch():
await start_crawl_and_watch()
```
### Scraping multiple URLs in batch
To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
idempotency_key = str(uuid.uuid4()) # optional idempotency key
batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
print(batch_scrape_result)
```
### Asynchronous batch scrape
To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
print(batch_scrape_result)
```
### Checking batch scrape status
To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
```python
id = batch_scrape_result['id']
status = app.check_batch_scrape_status(id)
```
### Batch scrape with WebSockets
To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
# inside an async function...
nest_asyncio.apply()
# Define event handlers
def on_document(detail):
print("DOC", detail)
def on_error(detail):
print("ERR", detail['error'])
def on_done(detail):
print("DONE", detail['status'])
# Function to start the crawl and watch process
async def start_crawl_and_watch():
# Initiate the batch scrape job and get the watcher
watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
# Add event listeners
watcher.add_event_listener("document", on_document)
watcher.add_event_listener("error", on_error)
watcher.add_event_listener("done", on_done)
# Start the watcher
await watcher.connect()
# Run the event loop
await start_crawl_and_watch()
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.

View File

@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-")
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])
# Test batch scrape
urls = ['https://example.com', 'https://docs.firecrawl.dev']
batch_scrape_params = {
'formats': ['markdown', 'html'],
}
# Synchronous batch scrape
batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
print("Synchronous Batch Scrape Result:")
print(batch_result['data'][0]['markdown'])
# Asynchronous batch scrape
async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
print("\nAsynchronous Batch Scrape Result:")
print(async_batch_result)
# Crawl a website:
idempotency_key = str(uuid.uuid4()) # optional idempotency key
crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp
__version__ = "1.3.1"
__version__ = "1.4.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -275,6 +275,123 @@ class FirecrawlApp:
else:
self._handle_error(response, 'map')
def batch_scrape_urls(self, urls: list[str],
params: Optional[Dict[str, Any]] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None) -> Any:
"""
Initiate a batch scrape job for the specified URLs using the Firecrawl API.
Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
- 'success' (bool): Indicates if the batch scrape was successful.
- 'status' (str): The final status of the batch scrape job (e.g., 'completed').
- 'completed' (int): Number of pages scraped so far.
- 'total' (int): Total number of pages to scrape.
- 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
- 'data' (List[Dict]): List of all the scraped pages.
Raises:
Exception: If the batch scrape job initiation or monitoring fails.
"""
endpoint = f'/v1/batch/scrape'
headers = self._prepare_headers(idempotency_key)
json_data = {'urls': urls}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
id = response.json().get('id')
return self._monitor_job_status(id, headers, poll_interval)
else:
self._handle_error(response, 'start batch scrape job')
def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
"""
Initiate a batch scrape job asynchronously.
Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
- 'success' (bool): Indicates if the batch scrape initiation was successful.
- 'id' (str): The unique identifier for the batch scrape job.
- 'url' (str): The URL to check the status of the batch scrape job.
"""
endpoint = f'/v1/batch/scrape'
headers = self._prepare_headers(idempotency_key)
json_data = {'urls': urls}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'start batch scrape job')
def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
"""
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
"""
crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
if crawl_response['success'] and 'id' in crawl_response:
return CrawlWatcher(crawl_response['id'], self)
else:
raise Exception("Batch scrape job failed to start")
def check_batch_scrape_status(self, id: str) -> Any:
"""
Check the status of a batch scrape job using the Firecrawl API.
Args:
id (str): The ID of the batch scrape job.
Returns:
Any: The status of the batch scrape job.
Raises:
Exception: If the status check request fails.
"""
endpoint = f'/v1/batch/scrape/{id}'
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
data = response.json()
return {
'success': True,
'status': data.get('status'),
'total': data.get('total'),
'completed': data.get('completed'),
'creditsUsed': data.get('creditsUsed'),
'expiresAt': data.get('expiresAt'),
'next': data.get('next'),
'data': data.get('data'),
'error': data.get('error')
}
else:
self._handle_error(response, 'check batch scrape status')
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.

View File

@ -0,0 +1,164 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
# ANSI color codes
class Colors:
CYAN = '\033[96m'
YELLOW = '\033[93m'
GREEN = '\033[92m'
RED = '\033[91m'
MAGENTA = '\033[95m'
BLUE = '\033[94m'
RESET = '\033[0m'
# Load environment variables
load_dotenv()
# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
# Initialize the FirecrawlApp and Anthropic client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = anthropic.Anthropic(api_key=anthropic_api_key)
# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app, client):
try:
print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")
map_prompt = f"""
The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
"""
print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
completion = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1000,
temperature=0,
system="You are an expert web crawler. Respond with the best search parameter.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": map_prompt
}
]
}
]
)
map_search_parameter = completion.content[0].text
print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")
print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
map_website = app.map_url(url, params={"search": map_search_parameter})
print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
return map_website['links']
except Exception as e:
print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
return None
# Scrape the top pages and check whether the objective is met; if so, return the result as JSON, otherwise return None
def find_objective_in_top_pages(map_website, objective, app, client):
try:
# Get top 2 links from the map result
top_links = map_website[:2]
print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")
# Scrape the pages in batch
batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})
print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}")
for scrape_result in batch_scrape_result['data']:
# Check if objective is met
check_prompt = f"""
Given the following scraped content and objective, determine if the objective is met.
If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
If the objective is not met with confidence, respond with 'Objective not met'.
Objective: {objective}
Scraped content: {scrape_result['markdown']}
Remember:
1. Only return JSON if you are confident the objective is fully met.
2. Keep the JSON structure as simple and flat as possible.
3. Do not include any explanations or markdown formatting in your response.
"""
completion = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1000,
temperature=0,
system="You are an expert web crawler. Respond with the relevant information in JSON format.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": check_prompt
}
]
}
]
)
result = completion.content[0].text
if result != "Objective not met":
print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
try:
return json.loads(result)
except json.JSONDecodeError:
print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
else:
print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")
print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
return None
except Exception as e:
print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
return None
# Main function to execute the process
def main():
# Get user input
url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
if not url.strip():
url = "https://www.firecrawl.dev/"
objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
if not objective.strip():
objective = "find me the pricing plans"
print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
# Find the relevant page
map_website = find_relevant_page_via_map(objective, url, app, client)
print(map_website)
if map_website:
print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
# Find objective in top pages
result = find_objective_in_top_pages(map_website, objective, app, client)
if result:
print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
else:
print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
else:
print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
if __name__ == "__main__":
main()