# Firecrawl Python SDK — usage examples (scrape, crawl, LLM extraction).
import uuid

from firecrawl.firecrawl import FirecrawlApp

# Create the API client; replace the placeholder with a real key.
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
# Scrape a website:
# Fetch a single page and print its markdown rendering.
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])
# Crawl a website:
# An idempotency key lets the service deduplicate a retried crawl request.
idempotency_key = str(uuid.uuid4())  # optional idempotency key

# Exclude blog pages from the crawl.
# NOTE(review): the positional True/2 appear to be wait-until-done and the
# poll interval in seconds — confirm against the firecrawl-py signature.
crawl_params = {'crawlerOptions': {'excludes': ['blog/*']}}
crawl_result = app.crawl_url('mendable.ai', crawl_params, True, 2, idempotency_key)
print(crawl_result)
# LLM Extraction:

# Define schema to extract contents into using pydantic
from pydantic import BaseModel, Field
from typing import List
# One Hacker News story as the LLM should extract it.
# (Intentionally no class docstring: pydantic v2 would surface it as the
# schema "description", changing the payload sent to the API.)
class ArticleSchema(BaseModel):
    title: str        # story headline
    points: int       # upvote count
    by: str           # submitting user
    commentsURL: str  # link to the story's comment page
# Container model: the list of top stories to extract.
class TopArticlesSchema(BaseModel):
    # `max_length` is the pydantic v2 constraint for sequences; the v1
    # spelling `max_items` is deprecated in v2 (which this file uses via
    # model_json_schema()). Both render as `maxItems` in the JSON schema.
    top: List[ArticleSchema] = Field(..., max_length=5, description="Top 5 stories")
# Run LLM extraction against the pydantic-derived JSON schema, restricting
# the input to the page's main content.
pydantic_extract_params = {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction',
    },
    'pageOptions': {
        'onlyMainContent': True,
    },
}
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', pydantic_extract_params)
print(llm_extraction_result['llm_extraction'])
# Define schema to extract contents into using json schema
# Hand-written equivalent of TopArticlesSchema: an object whose "top" key
# holds exactly five Hacker News story records.
json_schema = {
    "type": "object",
    "properties": {
        "top": {
            "type": "array",
            # Each array entry is one story with all four fields required.
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "points": {"type": "number"},
                    "by": {"type": "string"},
                    "commentsURL": {"type": "string"}
                },
                "required": ["title", "points", "by", "commentsURL"]
            },
            # Pin the list to exactly five entries.
            "minItems": 5,
            "maxItems": 5,
            "description": "Top 5 stories on Hacker News"
        }
    },
    "required": ["top"]
}
# Same extraction as above, but driven by the hand-written JSON schema
# instead of the pydantic model.
json_extract_params = {
    'extractorOptions': {
        'extractionSchema': json_schema,
        'mode': 'llm-extraction',
    },
    'pageOptions': {
        'onlyMainContent': True,
    },
}
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', json_extract_params)
print(llm_extraction_result['llm_extraction'])