firecrawl/examples/sales_web_crawler/app.py

79 lines
2.9 KiB
Python
Raw Normal View History

2024-10-19 03:21:18 +08:00
import csv
import json
2024-10-19 04:57:39 +08:00
import os
2024-10-19 03:21:18 +08:00
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
from serpapi import GoogleSearch
2024-10-20 20:38:38 +08:00
from swarm import Agent
from swarm.repl import run_demo_loop
2024-10-19 03:21:18 +08:00
# Pull API credentials from the environment (a local .env file is supported
# via python-dotenv, so keys never need to be hard-coded).
load_dotenv()

firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Shared module-level clients used by the crawling and analysis helpers below.
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(api_key=openai_api_key)
2024-10-22 00:01:09 +08:00
def crawl_and_analyze_url(url, objective):
    """Crawl a website using Firecrawl and extract objective-driven JSON.

    Args:
        url: The starting URL to crawl.
        objective: Natural-language description of the data to extract.

    Returns:
        dict with keys ``objective`` (echoed back) and ``results`` (a list of
        parsed JSON objects, one per crawled page whose analysis produced
        valid JSON).
    """
    print(f"Parameters: url={url}, objective={objective}")
    # Crawl up to 10 pages, requesting markdown output for each page.
    crawl_status = app.crawl_url(
        url,
        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
        poll_interval=5
    )
    # Guard against a failed or empty crawl: the original indexed
    # crawl_status['data'] directly, which raises KeyError when the crawl
    # response has no 'data' key.
    pages = crawl_status.get('data', []) if isinstance(crawl_status, dict) else crawl_status
    combined_results = []
    for item in pages:
        if 'markdown' not in item:
            continue
        content = item['markdown']
        # Ask the LLM to extract a JSON object for this page's content.
        analysis = generate_completion(
            "website data extractor",
            "Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
            "Objective: " + objective + "\nContent: " + content
        )
        # Keep only responses that parse as JSON; skip (and report) the rest
        # rather than aborting the whole crawl.
        try:
            combined_results.append(json.loads(analysis))
        except json.JSONDecodeError:
            print(f"Could not parse JSON from analysis: {analysis}")
    return {"objective": objective, "results": combined_results}
2024-10-19 03:21:18 +08:00
def generate_completion(role, task, content):
    """Run a single chat completion with a persona/task system prompt.

    Args:
        role: Persona for the system prompt (e.g. "website data extractor").
        task: Instructions appended to the persona in the system message.
        content: User message holding the material to analyze.

    Returns:
        The assistant's reply text.
    """
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    system_prompt = f"You are a {role}. {task}"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ]
    response = client.chat.completions.create(model="gpt-4o", messages=messages)
    return response.choices[0].message.content
2024-10-22 00:01:09 +08:00
def handoff_to_crawl_url():
    """Hand the conversation off to the crawl-website agent."""
    # Swarm interprets a returned Agent as a handoff target.
    next_agent = crawl_website_agent
    return next_agent
# Entry-point agent: gathers the URL and extraction objective from the user,
# then hands off to the crawl agent.
user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.",
    functions=[handoff_to_crawl_url],
)

# Worker agent: crawls the given URL and extracts data per the objective via
# the crawl_and_analyze_url tool.
crawl_website_agent = Agent(
    name="Crawl Website Agent",
    instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.",
    functions=[crawl_and_analyze_url],
)
2024-10-19 03:21:18 +08:00
if __name__ == "__main__":
    # Start the interactive Swarm REPL with the UI agent as the entry point;
    # streaming prints tokens as they arrive.
    run_demo_loop(user_interface_agent, stream=True)