Nick: from bulk to batch

Nicolas 2024-10-23 15:37:24 -03:00
parent 70c4e7c334
commit d8abd15716
8 changed files with 84 additions and 83 deletions

View File

@@ -1,8 +1,8 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
BulkScrapeRequest,
bulkScrapeRequestSchema,
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyScrapeOptions,
RequestWithAuth,
@@ -17,11 +17,11 @@ import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
export async function bulkScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>,
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>
) {
req.body = bulkScrapeRequestSchema.parse(req.body);
req.body = batchScrapeRequestSchema.parse(req.body);
const id = uuidv4();
@@ -92,7 +92,7 @@ export async function bulkScrapeController(
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
});
}
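For reference, a minimal sketch of what a client call against the renamed endpoint might look like, assuming an API key in the `Authorization` header; the host and credentials below are placeholders, and the request body mirrors what `batchScrapeRequestSchema` accepts (a `urls` array plus scrape options):

```ts
// Hypothetical call to the renamed route; host and API key are placeholders.
const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev", "https://mendable.ai"],
    formats: ["markdown", "html"],
  }),
});

const body = await res.json();
// Per the controller above, a successful response carries the job id and a status URL
// that now points at /v1/batch/scrape/:id rather than /v1/bulk/scrape/:id.
console.log(body.id, body.url);
```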

View File

@@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
return jobs;
}
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
@@ -113,7 +113,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const data = doneJobs.map(x => x.returnvalue);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
const nextURL = new URL(`${protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
nextURL.searchParams.set("skip", (start + data.length).toString());
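Outside the controller, the route selection introduced above reduces to a small helper; this is only an illustrative sketch (the function name and signature are invented for clarity, not part of the codebase):

```ts
// Batch-scrape status pages now live under /v1/batch/scrape/:jobId,
// while crawl status pages stay under /v1/crawl/:jobId.
function nextStatusUrl(host: string, jobId: string, isBatch: boolean, skip: number): URL {
  const nextURL = new URL(`https://${host}/v1/${isBatch ? "batch/scrape" : "crawl"}/${jobId}`);
  nextURL.searchParams.set("skip", skip.toString());
  return nextURL;
}

// nextStatusUrl("api.firecrawl.dev", "abc-123", true, 20)
//   → https://api.firecrawl.dev/v1/batch/scrape/abc-123?skip=20
```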

View File

@@ -144,7 +144,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export const bulkScrapeRequestSchema = scrapeOptions.extend({
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
}).strict(strictMessage).refine(
@@ -163,7 +163,7 @@ export const bulkScrapeRequestSchema = scrapeOptions.extend({
return obj;
});
export type BulkScrapeRequest = z.infer<typeof bulkScrapeRequestSchema>;
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
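As a rough illustration of what the renamed `batchScrapeRequestSchema` accepts, here is a reduced zod sketch; the real schema extends the full `scrapeOptions` object and adds a strict-mode refinement, so the option subset shown is illustrative only:

```ts
import { z } from "zod";

// Reduced approximation: a required `urls` array, an `origin` defaulting to "api",
// and scrape options (only a small illustrative subset shown here).
const batchScrapeRequestSketch = z
  .object({
    urls: z.string().url().array(),
    origin: z.string().optional().default("api"),
    formats: z.enum(["markdown", "html"]).array().optional(),
  })
  .strict();

batchScrapeRequestSketch.parse({
  urls: ["https://firecrawl.dev", "https://mendable.ai"],
  formats: ["markdown"],
});
```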

View File

@@ -17,7 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { bulkScrapeController } from "../controllers/v1/bulk-scrape";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@@ -124,12 +124,12 @@ v1Router.post(
);
v1Router.post(
"/bulk/scrape",
"/batch/scrape",
authMiddleware(RateLimiterMode.Crawl),
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(bulkScrapeController)
wrap(batchScrapeController)
);
v1Router.post(
@@ -147,9 +147,10 @@ v1Router.get(
);
v1Router.get(
"/bulk/scrape/:jobId",
"/batch/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlStatusController)
// Yes, it uses the same controller as the normal crawl status controller
wrap((req:any, res):any => crawlStatusController(req, res, true))
);
v1Router.get(
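Because the batch status route simply reuses `crawlStatusController` with `isBatch` set to `true`, polling a job from a client is a plain GET against the new path; a hedged sketch with placeholder host, key, and job id:

```ts
// Hypothetical status poll against the renamed route.
const jobId = "00000000-0000-0000-0000-000000000000"; // placeholder: id returned by POST /v1/batch/scrape
const statusRes = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
  headers: { Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}` },
});
const status = await statusRes.json();
// When more documents remain, the controller's `next` URL also points at /v1/batch/scrape/:jobId.
console.log(status.status, status.next);
```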

View File

@@ -145,32 +145,32 @@ watch.addEventListener("done", state => {
});
```
### Bulk scraping multiple URLs
### Batch scraping multiple URLs
To bulk scrape multiple URLs with error handling, use the `bulkScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats.
To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats.
```js
const bulkScrapeResponse = await app.bulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
formats: ['markdown', 'html'],
})
```
#### Asynchronous bulk scrape
#### Asynchronous batch scrape
To initiate an asynchronous bulk scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the bulk scrape.
To initiate an asynchronous batch scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the batch scrape.
```js
const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
```
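The snippet above still calls `asyncBulkScrapeUrls`; since the SDK source later in this commit renames that method to `asyncBatchScrapeUrls`, the updated call would presumably be:

```js
const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
```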
#### Bulk scrape with WebSockets
#### Batch scrape with WebSockets
To use bulk scrape with WebSockets, use the `bulkScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the bulk scrape job, such as the output formats.
To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
```js
// Bulk scrape multiple URLs with WebSockets:
const watch = await app.bulkScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
// Batch scrape multiple URLs with WebSockets:
const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
watch.addEventListener("document", doc => {
console.log("DOC", doc.detail);

View File

@@ -494,14 +494,14 @@ export default class FirecrawlApp {
}
/**
* Initiates a bulk scrape job for multiple URLs using the Firecrawl API.
* Initiates a batch scrape job for multiple URLs using the Firecrawl API.
* @param url - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the crawl operation.
*/
async bulkScrapeUrls(
async batchScrapeUrls(
urls: string[],
params?: ScrapeParams,
pollInterval: number = 2,
@@ -511,7 +511,7 @@ export default class FirecrawlApp {
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/bulk/scrape`,
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
@@ -519,7 +519,7 @@ export default class FirecrawlApp {
const id: string = response.data.id;
return this.monitorJobStatus(id, headers, pollInterval);
} else {
this.handleError(response, "start bulk scrape job");
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
@@ -531,7 +531,7 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
async asyncBulkScrapeUrls(
async asyncBatchScrapeUrls(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string
@@ -540,14 +540,14 @@ export default class FirecrawlApp {
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/bulk/scrape`,
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "start bulk scrape job");
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
@@ -560,42 +560,42 @@ export default class FirecrawlApp {
}
/**
* Initiates a bulk scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
* Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
* @param urls - The URL to scrape.
* @param params - Additional parameters for the scrape request.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns A CrawlWatcher instance to monitor the crawl job.
*/
async bulkScrapeUrlsAndWatch(
async batchScrapeUrlsAndWatch(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string,
) {
const crawl = await this.asyncBulkScrapeUrls(urls, params, idempotencyKey);
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
if (crawl.success && crawl.id) {
const id = crawl.id;
return new CrawlWatcher(id, this);
}
throw new FirecrawlError("Bulk scrape job failed to start", 400);
throw new FirecrawlError("Batch scrape job failed to start", 400);
}
/**
* Checks the status of a bulk scrape job using the Firecrawl API.
* @param id - The ID of the bulk scrape operation.
* Checks the status of a batch scrape job using the Firecrawl API.
* @param id - The ID of the batch scrape operation.
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
* @returns The response containing the job status.
*/
async checkBulkScrapeStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
if (!id) {
throw new FirecrawlError("No bulk scrape ID provided", 400);
throw new FirecrawlError("No batch scrape ID provided", 400);
}
const headers: AxiosRequestHeaders = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/bulk/scrape/${id}`,
`${this.apiUrl}/v1/batch/scrape/${id}`,
headers
);
if (response.status === 200) {
@@ -623,7 +623,7 @@ export default class FirecrawlApp {
error: response.data.error,
})
} else {
this.handleError(response, "check bulk scrape status");
this.handleError(response, "check batch scrape status");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
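Taken together, the renamed JS SDK methods can be exercised roughly as follows; a minimal sketch assuming the published `@mendable/firecrawl-js` package name and a placeholder API key:

```js
import FirecrawlApp from '@mendable/firecrawl-js'; // assumed package name

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
const urls = ['https://firecrawl.dev', 'https://mendable.ai'];

// Start a batch scrape without blocking...
const started = await app.asyncBatchScrapeUrls(urls, { formats: ['markdown', 'html'] });

if (started.success && started.id) {
  // ...then check on it with the renamed status method.
  const status = await app.checkBatchScrapeStatus(started.id);
  console.log(`${status.completed}/${status.total} pages, status: ${status.status}`);
}

// Or block until every URL has been scraped in a single call:
const result = await app.batchScrapeUrls(urls, { formats: ['markdown', 'html'] });
```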

View File

@@ -149,37 +149,37 @@ async def start_crawl_and_watch():
await start_crawl_and_watch()
```
### Scraping multiple URLs in bulk
### Scraping multiple URLs in batch
To bulk scrape multiple URLs, use the `bulk_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats.
To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper such as the output formats.
```python
idempotency_key = str(uuid.uuid4()) # optional idempotency key
bulk_scrape_result = app.bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
print(bulk_scrape_result)
batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
print(batch_scrape_result)
```
### Asynchronous bulk scrape
### Asynchronous batch scrape
To run a bulk scrape asynchronously, use the `async_bulk_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
bulk_scrape_result = app.async_bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
print(bulk_scrape_result)
batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
print(batch_scrape_result)
```
### Checking bulk scrape status
### Checking batch scrape status
To check the status of an asynchronous bulk scrape job, use the `check_bulk_scrape_job` method. It takes the job ID as a parameter and returns the current status of the bulk scrape job.
To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_job` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
```python
id = bulk_scrape_result['id']
status = app.check_bulk_scrape_job(id)
id = batch_scrape_result['id']
status = app.check_batch_scrape_job(id)
```
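Note that the Python SDK source later in this commit defines the renamed status method as `check_batch_scrape_status`, so the `check_batch_scrape_job` call shown above presumably corresponds to `app.check_batch_scrape_status(id)`.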
### Bulk scrape with WebSockets
### Batch scrape with WebSockets
To use bulk scrape with WebSockets, use the `bulk_scrape_urls_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
# inside an async function...
@@ -198,7 +198,7 @@ def on_done(detail):
# Function to start the crawl and watch process
async def start_crawl_and_watch():
# Initiate the crawl job and get the watcher
watcher = app.bulk_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
# Add event listeners
watcher.add_event_listener("document", on_document)

View File

@@ -275,12 +275,12 @@ class FirecrawlApp:
else:
self._handle_error(response, 'map')
def bulk_scrape_urls(self, urls: list[str],
def batch_scrape_urls(self, urls: list[str],
params: Optional[Dict[str, Any]] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None) -> Any:
"""
Initiate a bulk scrape job for the specified URLs using the Firecrawl API.
Initiate a batch scrape job for the specified URLs using the Firecrawl API.
Args:
urls (list[str]): The URLs to scrape.
@@ -290,18 +290,18 @@ class FirecrawlApp:
Returns:
Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
- 'success' (bool): Indicates if the bulk scrape was successful.
- 'status' (str): The final status of the bulk scrape job (e.g., 'completed').
- 'success' (bool): Indicates if the batch scrape was successful.
- 'status' (str): The final status of the batch scrape job (e.g., 'completed').
- 'completed' (int): Number of scraped pages that completed.
- 'total' (int): Total number of scraped pages.
- 'creditsUsed' (int): Estimated number of API credits used for this bulk scrape.
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the bulk scrape data expires.
- 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
- 'data' (List[Dict]): List of all the scraped pages.
Raises:
Exception: If the bulk scrape job initiation or monitoring fails.
Exception: If the batch scrape job initiation or monitoring fails.
"""
endpoint = f'/v1/bulk/scrape'
endpoint = f'/v1/batch/scrape'
headers = self._prepare_headers(idempotency_key)
json_data = {'urls': urls}
if params:
@@ -312,10 +312,10 @@ class FirecrawlApp:
return self._monitor_job_status(id, headers, poll_interval)
else:
self._handle_error(response, 'start bulk scrape job')
self._handle_error(response, 'start batch scrape job')
def async_bulk_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
"""
Initiate a crawl job asynchronously.
@@ -325,12 +325,12 @@ class FirecrawlApp:
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Dict[str, Any]: A dictionary containing the bulk scrape initiation response. The structure includes:
- 'success' (bool): Indicates if the bulk scrape initiation was successful.
- 'id' (str): The unique identifier for the bulk scrape job.
- 'url' (str): The URL to check the status of the bulk scrape job.
Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
- 'success' (bool): Indicates if the batch scrape initiation was successful.
- 'id' (str): The unique identifier for the batch scrape job.
- 'url' (str): The URL to check the status of the batch scrape job.
"""
endpoint = f'/v1/bulk/scrape'
endpoint = f'/v1/batch/scrape'
headers = self._prepare_headers(idempotency_key)
json_data = {'urls': urls}
if params:
@@ -339,11 +339,11 @@ class FirecrawlApp:
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'start bulk scrape job')
self._handle_error(response, 'start batch scrape job')
def bulk_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
"""
Initiate a bulk scrape job and return a CrawlWatcher to monitor the job via WebSocket.
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
Args:
urls (list[str]): The URLs to scrape.
@@ -351,28 +351,28 @@ class FirecrawlApp:
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
CrawlWatcher: An instance of CrawlWatcher to monitor the bulk scrape job.
CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
"""
crawl_response = self.async_bulk_scrape_urls(urls, params, idempotency_key)
crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
if crawl_response['success'] and 'id' in crawl_response:
return CrawlWatcher(crawl_response['id'], self)
else:
raise Exception("Bulk scrape job failed to start")
raise Exception("Batch scrape job failed to start")
def check_bulk_scrape_status(self, id: str) -> Any:
def check_batch_scrape_status(self, id: str) -> Any:
"""
Check the status of a bulk scrape job using the Firecrawl API.
Check the status of a batch scrape job using the Firecrawl API.
Args:
id (str): The ID of the bulk scrape job.
id (str): The ID of the batch scrape job.
Returns:
Any: The status of the bulk scrape job.
Any: The status of the batch scrape job.
Raises:
Exception: If the status check request fails.
"""
endpoint = f'/v1/bulk/scrape/{id}'
endpoint = f'/v1/batch/scrape/{id}'
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
@@ -390,7 +390,7 @@ class FirecrawlApp:
'error': data.get('error')
}
else:
self._handle_error(response, 'check bulk scrape status')
self._handle_error(response, 'check batch scrape status')
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""