mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00

fix(v1): update readme - v1.0.1

This commit is contained in:
parent 9e87d05b77
commit c7b3365ffd
@@ -18,29 +18,30 @@ npm install @mendable/firecrawl-js
 Here's an example of how to use the SDK with error handling:
 
 ```js
-import FirecrawlApp from "@mendable/firecrawl-js";
+import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';
 
 // Initialize the FirecrawlApp with your API key
-const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" });
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
-// Scrape a single URL
-const url = "https://mendable.ai";
-const scrapedData = await app.scrapeUrl(url);
+// Scrape a website
+const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
+  formats: ['markdown', 'html'],
+});
+
+if (scrapeResponse) {
+  console.log(scrapeResponse)
+}
 
 // Crawl a website
-const crawlUrl = "https://mendable.ai";
-const params = {
-  crawlerOptions: {
-    excludes: ["blog/"],
-    includes: [], // leave empty for all pages
-    limit: 1000,
-  },
-  pageOptions: {
-    onlyMainContent: true,
-  },
-};
-
-const crawlResult = await app.crawlUrl(crawlUrl, params);
+const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+  limit: 100,
+  scrapeOptions: {
+    formats: ['markdown', 'html'],
+  }
+} as CrawlParams, true, 30) as CrawlStatusResponse;
+
+if (crawlResponse) {
+  console.log(crawlResponse)
+}
 ```
 
 ### Scraping a URL
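Editor's note: the updated example logs responses but never shows a failure path, even though the surrounding text promises error handling. As a minimal sketch (assuming, per the Error Handling section the README references below, that the v1 methods throw on request failure), the calls could be wrapped like this:

```js
import FirecrawlApp from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

try {
  // Assumption: a non-2xx API response surfaces as a thrown error
  const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
    formats: ['markdown', 'html'],
  });
  console.log(scrapeResponse);
} catch (err) {
  // err.message is assumed to carry the API's error description
  console.error('Scrape failed:', err.message);
}
```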
@@ -57,28 +58,16 @@ const scrapedData = await app.scrapeUrl(url);
 To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
 
 ```js
-const crawlUrl = "https://example.com";
-const params = {
-  crawlerOptions: {
-    excludes: ["blog/"],
-    includes: [], // leave empty for all pages
-    limit: 1000,
-  },
-  pageOptions: {
-    onlyMainContent: true,
-  },
-};
-
-const waitUntilDone = true;
-const pollInterval = 5;
-
-const crawlResult = await app.crawlUrl(
-  crawlUrl,
-  params,
-  waitUntilDone,
-  pollInterval
-);
+const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
+  limit: 100,
+  scrapeOptions: {
+    formats: ['markdown', 'html'],
+  }
+} as CrawlParams, true, 30) as CrawlStatusResponse;
+
+if (crawlResponse) {
+  console.log(crawlResponse)
+}
 ```
 
 ### Checking Crawl Status
@@ -86,7 +75,7 @@ const crawlResult = await app.crawlUrl(
 To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```js
-const status = await app.checkCrawlStatus(jobId);
+const status = await app.checkCrawlStatus(id);
 ```
 
 ### Extracting structured data from a URL
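Editor's note: the blocking form of `crawlUrl` shown earlier waits for completion. For long jobs, a non-blocking start plus manual polling with `checkCrawlStatus` is the alternative the docs imply; this is a sketch only, and the `id` and `status` field names are assumptions drawn from the v1 examples in this diff:

```js
// Start the crawl without waiting (waitUntilDone = false per the signature above)
const started = await app.crawlUrl('https://firecrawl.dev', { limit: 100 }, false);

// Poll until the job finishes; 'id', 'status', and the 'completed'/'failed'
// values are assumed from the v1 examples, not confirmed by this diff
let status = await app.checkCrawlStatus(started.id);
while (status.status !== 'completed' && status.status !== 'failed') {
  await new Promise((resolve) => setTimeout(resolve, 5000)); // wait 5s between checks
  status = await app.checkCrawlStatus(started.id);
}
console.log(status);
```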
@@ -123,17 +112,13 @@ const scrapeResult = await app.scrapeUrl("https://firecrawl.dev", {
 console.log(scrapeResult.data["llm_extraction"]);
 ```
 
-### Search for a query
+### Map a Website
 
-With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.
+Use `mapUrl` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
 
 ```js
-const query = "what is mendable?";
-const searchResults = await app.search(query, {
-  pageOptions: {
-    fetchPageContent: true, // Fetch the page content for each search result
-  },
-});
+const mapResult = await app.mapUrl('https://example.com') as MapResponse;
+console.log(mapResult)
 ```
 
 ## Error Handling
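Editor's note: the new map section mentions options to exclude subdomains or use the sitemap without naming them. A hedged sketch follows; `includeSubdomains` and `ignoreSitemap` are illustrative parameter names, not confirmed by this diff:

```js
// Parameter names below are illustrative assumptions, not confirmed by this diff
const mapResult = await app.mapUrl('https://example.com', {
  includeSubdomains: false, // assumed option: skip subdomains
  ignoreSitemap: false,     // assumed option: use the sitemap when available
});
console.log(mapResult);
```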
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/cjs/index.js",
   "types": "types/index.d.ts",
@@ -18,23 +18,28 @@ pip install firecrawl-py
 Here's an example of how to use the SDK:
 
 ```python
-from firecrawl import FirecrawlApp
+from firecrawl.firecrawl import FirecrawlApp
 
 # Initialize the FirecrawlApp with your API key
-app = FirecrawlApp(api_key='your_api_key')
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
 
-# Scrape a single URL
-url = 'https://mendable.ai'
-scraped_data = app.scrape_url(url)
+# Scrape a website:
+scrape_status = app.scrape_url(
+  'https://firecrawl.dev',
+  params={'formats': ['markdown', 'html']}
+)
+print(scrape_status)
 
-# Crawl a website
-crawl_url = 'https://mendable.ai'
-params = {
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params)
+# Crawl a website:
+crawl_status = app.crawl_url(
+  'https://firecrawl.dev',
+  params={
+    'limit': 100,
+    'scrapeOptions': {'formats': ['markdown', 'html']}
+  },
+  wait_until_done=True,
+  poll_interval=30
+)
+print(crawl_status)
 ```
 
 ### Scraping a URL
@@ -72,15 +77,6 @@ data = app.scrape_url('https://news.ycombinator.com', {
 print(data["llm_extraction"])
 ```
 
-### Search for a query
-
-Used to search the web, get the most relevant results, scrape each page and return the markdown.
-
-```python
-query = 'what is mendable?'
-search_result = app.search(query)
-```
-
 ### Crawling a Website
 
 To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
@@ -88,18 +84,16 @@ To crawl a website, use the `crawl_url` method. It takes the starting URL and op
 The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
 
 ```python
-crawl_url = 'https://example.com'
-params = {
-    'crawlerOptions': {
-        'excludes': ['blog/*'],
-        'includes': [], # leave empty for all pages
-        'limit': 1000,
-    },
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+crawl_status = app.crawl_url(
+  'https://firecrawl.dev',
+  params={
+    'limit': 100,
+    'scrapeOptions': {'formats': ['markdown', 'html']}
+  },
+  wait_until_done=True,
+  poll_interval=30
+)
+print(crawl_status)
 ```
 
 If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
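Editor's note: as the paragraph above describes, `wait_until_done=False` returns immediately so the caller can poll. A minimal sketch of that flow, where the `'id'` key and the `'completed'`/`'failed'` status values are assumptions based on the v1 examples in this diff:

```python
import time

# Kick off the crawl without blocking; assumed to return the job info immediately
job = app.crawl_url('https://firecrawl.dev', params={'limit': 100}, wait_until_done=False)

# Poll manually; the 'id' key and status values are assumed from the v1 examples above
status = app.check_crawl_status(job['id'])
while status.get('status') not in ('completed', 'failed'):
    time.sleep(5)  # wait a few seconds between checks
    status = app.check_crawl_status(job['id'])
print(status)
```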
@@ -109,8 +103,18 @@ If `wait_until_done` is set to `True`, the `crawl_url` method will return the cr
 To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```python
-job_id = crawl_result['jobId']
-status = app.check_crawl_status(job_id)
+id = crawl_result['id']
+status = app.check_crawl_status(id)
 ```
 
+### Map a Website
+
+Use `map_url` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
+
+```python
+# Map a website:
+map_result = app.map_url('https://example.com')
+print(map_result)
+```
+
 ## Error Handling
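Editor's note: as in the JS section earlier, the mapping options go unnamed here; in this illustrative sketch the `includeSubdomains` and `ignoreSitemap` keys are assumptions, not confirmed by this diff:

```python
# Key names below are illustrative assumptions, not confirmed by this diff
map_result = app.map_url('https://example.com', params={
    'includeSubdomains': False,  # assumed option: skip subdomains
    'ignoreSitemap': False,      # assumed option: use the sitemap when available
})
print(map_result)
```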
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
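Editor's note: because the package exposes a named `"firecrawl"` logger, applications can adjust its verbosity with the standard `logging` module alone; for example:

```python
import logging

# Attach a handler and raise the level for the SDK's named logger
logging.basicConfig(level=logging.INFO)
logging.getLogger("firecrawl").setLevel(logging.DEBUG)
```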