# firecrawl/apps/python-sdk/example.py

from firecrawl import FirecrawlApp

# Client used by every example below; replace the placeholder with a real API key.
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# Scrape a website and print the 'markdown' entry of the result:
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])

# Crawl a website, passing an 'excludes' pattern list under 'crawlerOptions':
crawl_params = {
    'crawlerOptions': {
        'excludes': ['blog/*'],
    },
}
crawl_result = app.crawl_url('mendable.ai', crawl_params)
print(crawl_result)

# LLM Extraction:
# Define schema to extract contents into using pydantic
from pydantic import BaseModel, Field
from typing import List

# Shape of a single extracted story. TopArticlesSchema holds a list of these,
# and the hand-written json_schema later in this file declares the same four
# property names.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- a docstring would presumably show up in model_json_schema() as a
# description and change the schema sent to the API; confirm before adding one.
class ArticleSchema(BaseModel):
    # Field meanings below are inferred from the names only -- confirm.
    title: str  # presumably the story headline
    points: int  # presumably the story's score
    by: str  # presumably the submitter's username
    commentsURL: str  # presumably a link to the story's comment thread

# Top-level extraction model: a required `top` list holding at most five
# ArticleSchema entries. Its JSON schema is what gets passed as
# 'extractionSchema' in the pydantic-based example below.
class TopArticlesSchema(BaseModel):
    # `max_length` is the Pydantic v2 spelling of the list-size bound. The
    # previous `max_items` keyword is deprecated in v2 (this file already
    # requires v2 because it calls `model_json_schema()`); for a list field
    # the generated schema should still carry the same `maxItems: 5`.
    top: List[ArticleSchema] = Field(..., max_length=5, description="Top 5 stories")

# Scrape Hacker News in 'llm-extraction' mode, using the JSON schema generated
# from the TopArticlesSchema pydantic model as the extraction schema.
pydantic_extraction_params = {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction',
    },
    'pageOptions': {
        'onlyMainContent': True,
    },
}
llm_extraction_result = app.scrape_url(
    'https://news.ycombinator.com',
    pydantic_extraction_params,
)

# The extracted data is read from the 'llm_extraction' entry of the result.
print(llm_extraction_result['llm_extraction'])

# Define the schema to extract contents into as a plain JSON Schema dict,
# assembled from the innermost piece outwards.

# Properties of a single story object.
_article_properties = {
    "title": {"type": "string"},
    "points": {"type": "number"},
    "by": {"type": "string"},
    "commentsURL": {"type": "string"},
}

# One story: an object in which every declared property is required
# (the required list follows the declaration order of the properties).
_article_item_schema = {
    "type": "object",
    "properties": _article_properties,
    "required": list(_article_properties),
}

# The `top` array: exactly five story objects.
_top_stories_schema = {
    "type": "array",
    "items": _article_item_schema,
    "minItems": 5,
    "maxItems": 5,
    "description": "Top 5 stories on Hacker News",
}

json_schema = {
    "type": "object",
    "properties": {"top": _top_stories_schema},
    "required": ["top"],
}

# Same extraction request as the pydantic example, but with the hand-written
# json_schema dict supplied as the extraction schema.
extractor_options = {
    'extractionSchema': json_schema,
    'mode': 'llm-extraction',
}
page_options = {'onlyMainContent': True}

llm_extraction_result = app.scrape_url(
    'https://news.ycombinator.com',
    {
        'extractorOptions': extractor_options,
        'pageOptions': page_options,
    },
)

# The extracted data is read from the 'llm_extraction' entry of the result.
print(llm_extraction_result['llm_extraction'])
Initial commit 2024-04-16 05:01:47 +08:00			`from firecrawl import FirecrawlApp`

Nick: fixes js and pydantic implementation 2024-05-09 08:16:59 +08:00			`app = FirecrawlApp(api_key="fc-YOUR_API_KEY")`
Initial commit 2024-04-16 05:01:47 +08:00
Updated docs 2024-05-09 21:36:56 +08:00			`# Scrape a website:`
			`scrape_result = app.scrape_url('firecrawl.dev')`
			`print(scrape_result['markdown'])`
Update example.py 2024-05-09 08:36:40 +08:00
Updated docs 2024-05-09 21:36:56 +08:00			`# Crawl a website:`
			`crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})`
			`print(crawl_result)`
Nick: fixes js and pydantic implementation 2024-05-09 08:16:59 +08:00
Updated docs 2024-05-09 21:36:56 +08:00			`# LLM Extraction:`
			`# Define schema to extract contents into using pydantic`
Nick: fixes js and pydantic implementation 2024-05-09 08:16:59 +08:00			`from pydantic import BaseModel, Field`
Updated docs 2024-05-09 21:36:56 +08:00			`from typing import List`
Nick: fixes js and pydantic implementation 2024-05-09 08:16:59 +08:00
			`class ArticleSchema(BaseModel):`
			`title: str`
			`points: int`
			`by: str`
			`commentsURL: str`

			`class TopArticlesSchema(BaseModel):`
			`top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")`

Updated docs 2024-05-09 21:36:56 +08:00			`llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {`
Nick: fixes js and pydantic implementation 2024-05-09 08:16:59 +08:00			`'extractorOptions': {`
			`'extractionSchema': TopArticlesSchema.model_json_schema(),`
			`'mode': 'llm-extraction'`
			`},`
			`'pageOptions':{`
			`'onlyMainContent': True`
			`}`
			`})`
Initial commit 2024-04-16 05:01:47 +08:00
Updated docs 2024-05-09 21:36:56 +08:00			`print(llm_extraction_result['llm_extraction'])`

			`# Define schema to extract contents into using json schema`
			`json_schema = {`
			`"type": "object",`
			`"properties": {`
			`"top": {`
			`"type": "array",`
			`"items": {`
			`"type": "object",`
			`"properties": {`
			`"title": {"type": "string"},`
			`"points": {"type": "number"},`
			`"by": {"type": "string"},`
			`"commentsURL": {"type": "string"}`
			`},`
			`"required": ["title", "points", "by", "commentsURL"]`
			`},`
			`"minItems": 5,`
			`"maxItems": 5,`
			`"description": "Top 5 stories on Hacker News"`
			`}`
			`},`
			`"required": ["top"]`
			`}`

			`llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {`
			`'extractorOptions': {`
			`'extractionSchema': json_schema,`
			`'mode': 'llm-extraction'`
			`},`
			`'pageOptions':{`
			`'onlyMainContent': True`
			`}`
			`})`

			`print(llm_extraction_result['llm_extraction'])`