diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz deleted file mode 100644 index 59e42d15..00000000 Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz new file mode 100644 index 00000000..3dd82a93 Binary files /dev/null and b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl deleted file mode 100644 index 36676fe1..00000000 Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl new file mode 100644 index 00000000..4d7e3aac Binary files /dev/null and b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl differ diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO index 16af620a..7d4b8350 100644 --- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO +++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO @@ -1,7 +1,160 @@ Metadata-Version: 2.1 Name: firecrawl-py -Version: 0.0.10 +Version: 0.0.11 Summary: Python SDK for Firecrawl API Home-page: https://github.com/mendableai/firecrawl Author: Mendable.ai Author-email: nick@mendable.ai +License: GNU General Public License v3 (GPLv3) +Project-URL: Documentation, https://docs.firecrawl.dev +Project-URL: Source, https://github.com/mendableai/firecrawl +Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues +Keywords: SDK API firecrawl +Classifier: Development Status :: 5 - Production/Stable +Classifier: Environment :: Web Environment +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: GNU General Public License v3 
(GPLv3) +Classifier: Natural Language :: English +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Topic :: Internet +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search +Classifier: Topic :: Software Development +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Text Processing +Classifier: Topic :: Text Processing :: Indexing +Requires-Python: >=3.8 +Description-Content-Type: text/markdown + +# Firecrawl Python SDK + +The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. + +## Installation + +To install the Firecrawl Python SDK, you can use pip: + +```bash +pip install firecrawl-py +``` + +## Usage + +1. Get an API key from [firecrawl.dev](https://firecrawl.dev) +2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. + + +Here's an example of how to use the SDK: + +```python +from firecrawl import FirecrawlApp + +# Initialize the FirecrawlApp with your API key +app = FirecrawlApp(api_key='your_api_key') + +# Scrape a single URL +url = 'https://mendable.ai' +scraped_data = app.scrape_url(url) + +# Crawl a website +crawl_url = 'https://mendable.ai' +params = { + 'pageOptions': { + 'onlyMainContent': True + } +} +crawl_result = app.crawl_url(crawl_url, params=params) +``` + +### Scraping a URL + +To scrape a single URL, use the `scrape_url` method. 
It takes the URL as a parameter and returns the scraped data as a dictionary. + +```python +url = 'https://example.com' +scraped_data = app.scrape_url(url) +``` +### Extracting structured data from a URL + +With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it: + +```python +class ArticleSchema(BaseModel): + title: str + points: int + by: str + commentsURL: str + +class TopArticlesSchema(BaseModel): + top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") + +data = app.scrape_url('https://news.ycombinator.com', { + 'extractorOptions': { + 'extractionSchema': TopArticlesSchema.model_json_schema(), + 'mode': 'llm-extraction' + }, + 'pageOptions':{ + 'onlyMainContent': True + } +}) +print(data["llm_extraction"]) +``` + +### Search for a query + +Used to search the web, get the most relevant results, scrape each page and return the markdown. + +```python +query = 'what is mendable?' +search_result = app.search(query) +``` + +### Crawling a Website + +To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. + +The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method. 
+ +```python +crawl_url = 'https://example.com' +params = { + 'crawlerOptions': { + 'excludes': ['blog/*'], + 'includes': [], # leave empty for all pages + 'limit': 1000, + }, + 'pageOptions': { + 'onlyMainContent': True + } +} +crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5) +``` + +If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised. + +### Checking Crawl Status + +To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. + +```python +job_id = crawl_result['jobId'] +status = app.check_crawl_status(job_id) +``` + +## Error Handling + +The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. + +## Contributing + +Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. + +## License + +The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 
diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index de49e1a5..63b5c9fc 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -1,14 +1,50 @@ -from setuptools import setup, find_packages +from pathlib import Path + +from setuptools import find_packages, setup + +this_directory = Path(__file__).parent +long_description_content = (this_directory / "README.md").read_text() setup( - name='firecrawl-py', - version='0.0.10', - url='https://github.com/mendableai/firecrawl', - author='Mendable.ai', - author_email='nick@mendable.ai', - description='Python SDK for Firecrawl API', + name="firecrawl-py", + version="0.0.11", + url="https://github.com/mendableai/firecrawl", + author="Mendable.ai", + author_email="nick@mendable.ai", + description="Python SDK for Firecrawl API", + long_description=long_description_content, + long_description_content_type="text/markdown", packages=find_packages(), install_requires=[ - 'requests', + "requests", ], + python_requires='>=3.8', + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: Web Environment", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Internet", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing", + "Topic :: Text Processing :: Indexing", + ], + keywords="SDK API firecrawl", + project_urls={ + "Documentation": "https://docs.firecrawl.dev", + "Source": "https://github.com/mendableai/firecrawl", + "Tracker": 
"https://github.com/mendableai/firecrawl/issues", + }, + license="GNU General Public License v3 (GPLv3)", )