mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
commit
a9e45cdb15
BIN
apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
vendored
BIN
apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
vendored
Binary file not shown.
BIN
apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz
vendored
Normal file
BIN
apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz
vendored
Normal file
Binary file not shown.
Binary file not shown.
BIN
apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl
vendored
Normal file
BIN
apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl
vendored
Normal file
Binary file not shown.
|
@ -1,7 +1,160 @@
|
|||
Metadata-Version: 2.1
|
||||
Name: firecrawl-py
|
||||
Version: 0.0.10
|
||||
Version: 0.0.11
|
||||
Summary: Python SDK for Firecrawl API
|
||||
Home-page: https://github.com/mendableai/firecrawl
|
||||
Author: Mendable.ai
|
||||
Author-email: nick@mendable.ai
|
||||
License: GNU General Public License v3 (GPLv3)
|
||||
Project-URL: Documentation, https://docs.firecrawl.dev
|
||||
Project-URL: Source, https://github.com/mendableai/firecrawl
|
||||
Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
|
||||
Keywords: SDK API firecrawl
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Environment :: Web Environment
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Topic :: Internet
|
||||
Classifier: Topic :: Internet :: WWW/HTTP
|
||||
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
||||
Classifier: Topic :: Software Development
|
||||
Classifier: Topic :: Software Development :: Libraries
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Text Processing
|
||||
Classifier: Topic :: Text Processing :: Indexing
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/markdown
|
||||
|
||||
# Firecrawl Python SDK
|
||||
|
||||
The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
|
||||
|
||||
## Installation
|
||||
|
||||
To install the Firecrawl Python SDK, you can use pip:
|
||||
|
||||
```bash
|
||||
pip install firecrawl-py
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
|
||||
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
|
||||
|
||||
|
||||
Here's an example of how to use the SDK:
|
||||
|
||||
```python
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
# Initialize the FirecrawlApp with your API key
|
||||
app = FirecrawlApp(api_key='your_api_key')
|
||||
|
||||
# Scrape a single URL
|
||||
url = 'https://mendable.ai'
|
||||
scraped_data = app.scrape_url(url)
|
||||
|
||||
# Crawl a website
|
||||
crawl_url = 'https://mendable.ai'
|
||||
params = {
|
||||
'pageOptions': {
|
||||
'onlyMainContent': True
|
||||
}
|
||||
}
|
||||
crawl_result = app.crawl_url(crawl_url, params=params)
|
||||
```
|
||||
|
||||
### Scraping a URL
|
||||
|
||||
To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
|
||||
|
||||
```python
|
||||
url = 'https://example.com'
|
||||
scraped_data = app.scrape_url(url)
|
||||
```
|
||||
### Extracting structured data from a URL
|
||||
|
||||
With LLM extraction, you can easily extract structured data from any URL. We also support Pydantic schemas to make this easier. Here is how to use it:
|
||||
|
||||
```python
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
points: int
|
||||
by: str
|
||||
commentsURL: str
|
||||
|
||||
class TopArticlesSchema(BaseModel):
|
||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||
|
||||
data = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': TopArticlesSchema.model_json_schema(),
|
||||
'mode': 'llm-extraction'
|
||||
},
|
||||
'pageOptions':{
|
||||
'onlyMainContent': True
|
||||
}
|
||||
})
|
||||
print(data["llm_extraction"])
|
||||
```
|
||||
|
||||
### Search for a query
|
||||
|
||||
The `search` method searches the web, gets the most relevant results, scrapes each page, and returns the markdown.
|
||||
|
||||
```python
|
||||
query = 'what is mendable?'
|
||||
search_result = app.search(query)
|
||||
```
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
|
||||
|
||||
The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
|
||||
|
||||
```python
|
||||
crawl_url = 'https://example.com'
|
||||
params = {
|
||||
'crawlerOptions': {
|
||||
'excludes': ['blog/*'],
|
||||
'includes': [], # leave empty for all pages
|
||||
'limit': 1000,
|
||||
},
|
||||
'pageOptions': {
|
||||
'onlyMainContent': True
|
||||
}
|
||||
}
|
||||
crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
|
||||
```
|
||||
|
||||
If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
|
||||
|
||||
### Checking Crawl Status
|
||||
|
||||
To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
||||
|
||||
```python
|
||||
job_id = crawl_result['jobId']
|
||||
status = app.check_crawl_status(job_id)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
|
||||
|
||||
## License
|
||||
|
||||
The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
|
||||
|
|
|
@ -1,14 +1,50 @@
|
|||
from setuptools import setup, find_packages
|
||||
from pathlib import Path
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
this_directory = Path(__file__).parent
|
||||
long_description_content = (this_directory / "README.md").read_text()
|
||||
|
||||
setup(
|
||||
name='firecrawl-py',
|
||||
version='0.0.10',
|
||||
url='https://github.com/mendableai/firecrawl',
|
||||
author='Mendable.ai',
|
||||
author_email='nick@mendable.ai',
|
||||
description='Python SDK for Firecrawl API',
|
||||
name="firecrawl-py",
|
||||
version="0.0.11",
|
||||
url="https://github.com/mendableai/firecrawl",
|
||||
author="Mendable.ai",
|
||||
author_email="nick@mendable.ai",
|
||||
description="Python SDK for Firecrawl API",
|
||||
long_description=long_description_content,
|
||||
long_description_content_type="text/markdown",
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
'requests',
|
||||
"requests",
|
||||
],
|
||||
python_requires='>=3.8',
|
||||
classifiers=[
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Web Environment",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||
"Natural Language :: English",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Topic :: Internet",
|
||||
"Topic :: Internet :: WWW/HTTP",
|
||||
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
||||
"Topic :: Software Development",
|
||||
"Topic :: Software Development :: Libraries",
|
||||
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||
"Topic :: Text Processing",
|
||||
"Topic :: Text Processing :: Indexing",
|
||||
],
|
||||
keywords="SDK API firecrawl",
|
||||
project_urls={
|
||||
"Documentation": "https://docs.firecrawl.dev",
|
||||
"Source": "https://github.com/mendableai/firecrawl",
|
||||
"Tracker": "https://github.com/mendableai/firecrawl/issues",
|
||||
},
|
||||
license="GNU General Public License v3 (GPLv3)",
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue
Block a user