From 27c5a93f4ed6f4c1eb393cfc44e0747c1934b556 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:39:38 -0300 Subject: [PATCH 1/2] added next handler for python sdk (js is ok) --- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 78 ++++++++++++++++++++------ 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 23d22cf4..eee37690 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.4.0" +__version__ = "1.4.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 1986ddd2..c2693c3d 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -189,17 +189,38 @@ class FirecrawlApp: headers = self._prepare_headers() response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: - data = response.json() + status_data = response.json() + if status_data['status'] == 'completed': + if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + next_url = status_data.get('next') + if not next_url: + logger.warning("Expected 'next' URL is missing.") + break + try: + status_response = self._get_request(next_url, headers) + if status_response.status_code != 200: + logger.error(f"Failed to fetch next page: {status_response.status_code}") + break + status_data = status_response.json() + data.extend(status_data.get('data', [])) + except Exception as e: + logger.error(f"Error during pagination request: {e}") + break + status_data.pop('next', None) + status_data['data'] = data + return { 'success': True, - 'status': data.get('status'), - 'total': data.get('total'), - 'completed': data.get('completed'), - 'creditsUsed': data.get('creditsUsed'), - 'expiresAt': data.get('expiresAt'), - 'next': data.get('next'), - 'data': data.get('data'), - 'error': data.get('error') + 'status': status_data.get('status'), + 'total': status_data.get('total'), + 'completed': status_data.get('completed'), + 'creditsUsed': status_data.get('creditsUsed'), + 'expiresAt': status_data.get('expiresAt'), + 'data': status_data.get('data'), + 'error': status_data.get('error'), + 'next': status_data.get('next', None) } else: self._handle_error(response, 'check crawl status') @@ -377,17 +398,38 @@ class FirecrawlApp: headers = self._prepare_headers() response = self._get_request(f'{self.api_url}{endpoint}', headers) if response.status_code == 200: - data = response.json() + status_data = response.json() + if status_data['status'] == 'completed': + if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + next_url = status_data.get('next') + if not next_url: + logger.warning("Expected 'next' URL is missing.") + break + try: + status_response = self._get_request(next_url, headers) + if status_response.status_code != 200: + logger.error(f"Failed to fetch next page: {status_response.status_code}") + break + status_data = status_response.json() + data.extend(status_data.get('data', [])) + except Exception as e: + logger.error(f"Error during pagination request: {e}") + break + status_data.pop('next', None) + status_data['data'] = data + return { 'success': True, - 'status': data.get('status'), - 'total': data.get('total'), - 'completed': data.get('completed'), - 'creditsUsed': data.get('creditsUsed'), - 'expiresAt': data.get('expiresAt'), - 'next': data.get('next'), - 'data': data.get('data'), - 'error': data.get('error') + 'status': status_data.get('status'), + 'total': status_data.get('total'), + 'completed': status_data.get('completed'), + 'creditsUsed': status_data.get('creditsUsed'), + 'expiresAt': status_data.get('expiresAt'), + 'data': status_data.get('data'), + 'error': status_data.get('error'), + 'next': status_data.get('next', None) } else: self._handle_error(response, 'check batch scrape status') From 9688bad60d1109b7697c245857a3ef3cf57df37d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 11 Nov 2024 14:48:11 -0500 Subject: [PATCH 2/2] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index eee37690..cb897b7e 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.4.1" +__version__ = "1.5.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl")