mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge pull request #137 from mendableai/nsc/llm-extraction-zod-integration
[Docs] Updated examples
This commit is contained in:
commit
832a4f53e0
|
@ -1,7 +1,13 @@
|
|||
import FirecrawlApp from '@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "YOUR_API_KEY"});
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
console.log(scrapeResult.data.content)
|
||||
|
||||
// Crawl a website:
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
|
||||
console.log(crawlResult)
|
||||
|
||||
|
@ -17,4 +23,61 @@ while (true) {
|
|||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
console.log(job.data[0].content);
|
||||
console.log(job.data[0].content);
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query)
|
||||
console.log(searchResult)
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
83
apps/js-sdk/example.ts
Normal file
83
apps/js-sdk/example.ts
Normal file
|
@ -0,0 +1,83 @@
|
|||
import FirecrawlApp, { JobStatusResponse } from '@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
console.log(scrapeResult.data.content)
|
||||
|
||||
// Crawl a website:
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId: string = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job: JobStatusResponse;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId);
|
||||
if (job.status === 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
console.log(job.data[0].content);
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query)
|
||||
console.log(searchResult)
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
|
@ -77,6 +77,42 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. It takes
|
|||
scrapeExample();
|
||||
```
|
||||
|
||||
### Extracting structured data from a URL
|
||||
|
||||
With LLM extraction, you can easily extract structured data from any URL. We also support zod schemas to make this easier. Here is how to use it:
|
||||
|
||||
```js
|
||||
import { z } from "zod";
|
||||
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
```
|
||||
|
||||
### Search for a query
|
||||
|
||||
Searches the web, gets the most relevant results, scrapes each page, and returns the markdown.
|
||||
|
||||
```js
|
||||
query = 'what is mendable?'
|
||||
searchResult = app.search(query)
|
||||
```
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "0.0.19",
|
||||
"version": "0.0.20",
|
||||
"description": "JavaScript SDK for Firecrawl API",
|
||||
"main": "build/index.js",
|
||||
"types": "types/index.d.ts",
|
||||
|
|
|
@ -1,20 +1,19 @@
|
|||
from firecrawl import FirecrawlApp
|
||||
|
||||
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
|
||||
|
||||
# Scrape a website:
|
||||
scrape_result = app.scrape_url('firecrawl.dev')
|
||||
print(scrape_result['markdown'])
|
||||
|
||||
# Crawl a website:
|
||||
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
|
||||
print(crawl_result)
|
||||
|
||||
print(crawl_result[0]['markdown'])
|
||||
|
||||
job_id = crawl_result['jobId']
|
||||
print(job_id)
|
||||
|
||||
status = app.check_crawl_status(job_id)
|
||||
print(status)
|
||||
|
||||
# LLM Extraction:
|
||||
# Define schema to extract contents into using pydantic
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional
|
||||
from typing import List
|
||||
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
|
@ -25,7 +24,7 @@ class ArticleSchema(BaseModel):
|
|||
class TopArticlesSchema(BaseModel):
|
||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||
|
||||
a = app.scrape_url('https://news.ycombinator.com', {
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': TopArticlesSchema.model_json_schema(),
|
||||
'mode': 'llm-extraction'
|
||||
|
@ -35,3 +34,40 @@ a = app.scrape_url('https://news.ycombinator.com', {
|
|||
}
|
||||
})
|
||||
|
||||
print(llm_extraction_result['llm_extraction'])
|
||||
|
||||
# Define schema to extract contents into using json schema
|
||||
json_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': json_schema,
|
||||
'mode': 'llm-extraction'
|
||||
},
|
||||
'pageOptions':{
|
||||
'onlyMainContent': True
|
||||
}
|
||||
})
|
||||
|
||||
print(llm_extraction_result['llm_extraction'])
|
Loading…
Reference in New Issue
Block a user