mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-15 19:22:19 +08:00
feat(bulk/scrape): add node and python SDK integration + docs
This commit is contained in:
parent
03b37998fd
commit
3cd328cf93
@@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
});
```

### Bulk scraping multiple URLs

To bulk scrape multiple URLs with error handling, use the `bulkScrapeUrls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the bulk scrape job, such as the output formats.

```js
const bulkScrapeResponse = await app.bulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
  formats: ['markdown', 'html'],
})
```
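
Since `bulkScrapeUrls` waits for the job to finish, the resolved value carries the final job status and the scraped documents. A minimal sketch of reading it (field names follow the SDK's status response; the `markdown` field assumes that format was requested):

```js
if (bulkScrapeResponse.success) {
  console.log(`${bulkScrapeResponse.completed}/${bulkScrapeResponse.total} pages scraped`);
  console.log(bulkScrapeResponse.data?.[0]?.markdown); // first scraped page, if any
}
```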

#### Asynchronous bulk scrape

To initiate an asynchronous bulk scrape, use the `asyncBulkScrapeUrls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scrape, such as the output formats. On success, the method returns an ID, which you can use to check the status of the bulk scrape later.

```js
const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
```
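
The returned ID can later be passed to `checkBulkScrapeStatus` to poll the job; a minimal sketch:

```js
if (asyncBulkScrapeResult.success && asyncBulkScrapeResult.id) {
  const status = await app.checkBulkScrapeStatus(asyncBulkScrapeResult.id);
  console.log(status.status); // current job status, e.g. "completed" once finished
}
```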

#### Bulk scrape with WebSockets

To use bulk scrape with WebSockets, use the `bulkScrapeUrlsAndWatch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the bulk scrape job, such as the output formats.

```js
// Bulk scrape multiple URLs with WebSockets:
const watch = await app.bulkScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });

watch.addEventListener("document", doc => {
  console.log("DOC", doc.detail);
});

watch.addEventListener("error", err => {
  console.error("ERR", err.detail.error);
});

watch.addEventListener("done", state => {
  console.log("DONE", state.detail.status);
});
```

## Error Handling

The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception is thrown with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
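
For example, a minimal sketch of wrapping a bulk scrape call:

```js
try {
  const result = await app.bulkScrapeUrls(['https://firecrawl.dev'], { formats: ['markdown'] });
  console.log(result);
} catch (error) {
  console.error(error.message); // descriptive error message from the SDK
}
```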

@@ -493,6 +493,144 @@ export default class FirecrawlApp {
    return { success: false, error: "Internal server error." };
  }

  /**
   * Initiates a bulk scrape job for multiple URLs using the Firecrawl API.
   * @param urls - The URLs to scrape.
   * @param params - Additional parameters for the scrape request.
   * @param pollInterval - Time in seconds between job status checks.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns The response from the bulk scrape operation.
   */
  async bulkScrapeUrls(
    urls: string[],
    params?: ScrapeParams,
    pollInterval: number = 2,
    idempotencyKey?: string
  ): Promise<CrawlStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: any = { urls, ...(params ?? {}) };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/bulk/scrape`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        const id: string = response.data.id;
        return this.monitorJobStatus(id, headers, pollInterval);
      } else {
        this.handleError(response, "start bulk scrape job");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

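  /**
   * Initiates a bulk scrape job asynchronously and returns the initiation response
   * (including the job id) without waiting for the job to complete.
   * @param urls - The URLs to scrape.
   * @param params - Additional parameters for the scrape request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns The response from the bulk scrape initiation.
   */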
  async asyncBulkScrapeUrls(
    urls: string[],
    params?: ScrapeParams,
    idempotencyKey?: string
  ): Promise<CrawlResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
    let jsonData: any = { urls, ...(params ?? {}) };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/bulk/scrape`,
        jsonData,
        headers
      );
      if (response.status === 200) {
        return response.data;
      } else {
        this.handleError(response, "start bulk scrape job");
      }
    } catch (error: any) {
      if (error.response?.data?.error) {
        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
      } else {
        throw new FirecrawlError(error.message, 500);
      }
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Initiates a bulk scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
   * @param urls - The URLs to scrape.
   * @param params - Additional parameters for the scrape request.
   * @param idempotencyKey - Optional idempotency key for the request.
   * @returns A CrawlWatcher instance to monitor the bulk scrape job.
   */
  async bulkScrapeUrlsAndWatch(
    urls: string[],
    params?: ScrapeParams,
    idempotencyKey?: string,
  ) {
    const crawl = await this.asyncBulkScrapeUrls(urls, params, idempotencyKey);

    if (crawl.success && crawl.id) {
      const id = crawl.id;
      return new CrawlWatcher(id, this);
    }

    throw new FirecrawlError("Bulk scrape job failed to start", 400);
  }

  /**
   * Checks the status of a bulk scrape job using the Firecrawl API.
   * @param id - The ID of the bulk scrape operation.
   * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
   * @returns The response containing the job status.
   */
  async checkBulkScrapeStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
    if (!id) {
      throw new FirecrawlError("No bulk scrape ID provided", 400);
    }

    const headers: AxiosRequestHeaders = this.prepareHeaders();
    try {
      const response: AxiosResponse = await this.getRequest(
        `${this.apiUrl}/v1/bulk/scrape/${id}`,
        headers
      );
      if (response.status === 200) {
        let allData = response.data.data;
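        // When getAllData is set and the job has completed, follow the paginated
        // `next` links and concatenate every page of documents into allData.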
        if (getAllData && response.data.status === "completed") {
          let statusData = response.data;
          if ("data" in statusData) {
            let data = statusData.data;
            while ('next' in statusData) {
              statusData = (await this.getRequest(statusData.next, headers)).data;
              data = data.concat(statusData.data);
            }
            allData = data;
          }
        }
        return ({
          success: response.data.success,
          status: response.data.status,
          total: response.data.total,
          completed: response.data.completed,
          creditsUsed: response.data.creditsUsed,
          expiresAt: new Date(response.data.expiresAt),
          next: response.data.next,
          data: allData,
          error: response.data.error,
        })
      } else {
        this.handleError(response, "check bulk scrape status");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Prepares the headers for an API request.
   * @param idempotencyKey - Optional key to ensure idempotency.

@@ -149,6 +149,69 @@ async def start_crawl_and_watch():
await start_crawl_and_watch()
```

### Scraping multiple URLs in bulk

To bulk scrape multiple URLs, use the `bulk_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.

```python
idempotency_key = str(uuid.uuid4()) # optional idempotency key
bulk_scrape_result = app.bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
print(bulk_scrape_result)
```
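
Because `bulk_scrape_urls` waits for the job to finish, the returned dictionary already contains the final status and the scraped documents. A minimal sketch of reading it (field names follow the method's docstring; the `markdown` key assumes that format was requested):

```python
print(f"{bulk_scrape_result.get('completed')}/{bulk_scrape_result.get('total')} pages scraped")
for doc in bulk_scrape_result.get('data') or []:
    print(doc.get('markdown', '')[:200])  # first 200 characters of each page's markdown
```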

### Asynchronous bulk scrape

To run a bulk scrape asynchronously, use the `async_bulk_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.

```python
bulk_scrape_result = app.async_bulk_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
print(bulk_scrape_result)
```

### Checking bulk scrape status

To check the status of an asynchronous bulk scrape job, use the `check_bulk_scrape_status` method. It takes the job ID as a parameter and returns the current status of the bulk scrape job.

```python
id = bulk_scrape_result['id']
status = app.check_bulk_scrape_status(id)
```
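
A minimal polling sketch built on this (the interval and terminal statuses are illustrative):

```python
import time

status = app.check_bulk_scrape_status(id)
while status.get('status') not in ('completed', 'failed'):
    time.sleep(2)  # poll every couple of seconds
    status = app.check_bulk_scrape_status(id)
print(status.get('status'), f"{status.get('completed')}/{status.get('total')} pages")
```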

### Bulk scrape with WebSockets

To use bulk scrape with WebSockets, use the `bulk_scrape_urls_and_watch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.

```python
# inside an async function...
nest_asyncio.apply()

# Define event handlers
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

# Function to start the bulk scrape and watch process
async def start_crawl_and_watch():
    # Initiate the bulk scrape job and get the watcher
    watcher = app.bulk_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})

    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher
    await watcher.connect()

# Run the event loop
await start_crawl_and_watch()
```

## Error Handling

The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception is raised with a descriptive error message.
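
For example, a minimal sketch of catching such an error around a bulk scrape call:

```python
try:
    result = app.bulk_scrape_urls(['firecrawl.dev'], {'formats': ['markdown']})
    print(result)
except Exception as e:
    print(f"Bulk scrape failed: {e}")  # descriptive error message from the API / SDK
```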

@@ -271,6 +271,123 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'map')

    def bulk_scrape_urls(self, urls: list[str],
                         params: Optional[Dict[str, Any]] = None,
                         poll_interval: Optional[int] = 2,
                         idempotency_key: Optional[str] = None) -> Any:
        """
        Initiate a bulk scrape job for the specified URLs using the Firecrawl API.

        Args:
            urls (list[str]): The URLs to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
            poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
             - 'success' (bool): Indicates if the bulk scrape was successful.
             - 'status' (str): The final status of the bulk scrape job (e.g., 'completed').
             - 'completed' (int): Number of scraped pages that completed.
             - 'total' (int): Total number of scraped pages.
             - 'creditsUsed' (int): Estimated number of API credits used for this bulk scrape.
             - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the bulk scrape data expires.
             - 'data' (List[Dict]): List of all the scraped pages.

        Raises:
            Exception: If the bulk scrape job initiation or monitoring fails.
        """
        endpoint = f'/v1/bulk/scrape'
        headers = self._prepare_headers(idempotency_key)
        json_data = {'urls': urls}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
            id = response.json().get('id')
            return self._monitor_job_status(id, headers, poll_interval)
        else:
            self._handle_error(response, 'start bulk scrape job')

    def async_bulk_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
        """
        Initiate a bulk scrape job asynchronously.

        Args:
            urls (list[str]): The URLs to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            Dict[str, Any]: A dictionary containing the bulk scrape initiation response. The structure includes:
             - 'success' (bool): Indicates if the bulk scrape initiation was successful.
             - 'id' (str): The unique identifier for the bulk scrape job.
             - 'url' (str): The URL to check the status of the bulk scrape job.
        """
        endpoint = f'/v1/bulk/scrape'
        headers = self._prepare_headers(idempotency_key)
        json_data = {'urls': urls}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, 'start bulk scrape job')

    def bulk_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
        """
        Initiate a bulk scrape job and return a CrawlWatcher to monitor the job via WebSocket.

        Args:
            urls (list[str]): The URLs to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            CrawlWatcher: An instance of CrawlWatcher to monitor the bulk scrape job.
        """
        crawl_response = self.async_bulk_scrape_urls(urls, params, idempotency_key)
        if crawl_response['success'] and 'id' in crawl_response:
            return CrawlWatcher(crawl_response['id'], self)
        else:
            raise Exception("Bulk scrape job failed to start")

    def check_bulk_scrape_status(self, id: str) -> Any:
        """
        Check the status of a bulk scrape job using the Firecrawl API.

        Args:
            id (str): The ID of the bulk scrape job.

        Returns:
            Any: The status of the bulk scrape job.

        Raises:
            Exception: If the status check request fails.
        """
        endpoint = f'/v1/bulk/scrape/{id}'

        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
            data = response.json()
            return {
                'success': True,
                'status': data.get('status'),
                'total': data.get('total'),
                'completed': data.get('completed'),
                'creditsUsed': data.get('creditsUsed'),
                'expiresAt': data.get('expiresAt'),
                'next': data.get('next'),
                'data': data.get('data'),
                'error': data.get('error')
            }
        else:
            self._handle_error(response, 'check bulk scrape status')

    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.