2024-10-19 03:21:18 +08:00
import csv
import json
2024-10-19 04:57:39 +08:00
import os
2024-10-19 03:21:18 +08:00
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
from serpapi import GoogleSearch
2024-10-20 20:38:38 +08:00
from swarm import Agent
from swarm . repl import run_demo_loop
# Pull API keys from a local .env file before any client is constructed.
load_dotenv()

# Module-level clients shared by every agent function below.
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def crawl_and_analyze_url(url, objective):
    """Crawl a website using Firecrawl and analyze the content.

    Args:
        url: Root URL handed to Firecrawl's crawler.
        objective: Natural-language description of the data to extract.

    Returns:
        dict with the original ``objective`` and a ``results`` list holding one
        parsed JSON object per page whose analysis was valid JSON.
    """
    print(f"Parameters: url={url}, objective={objective}")
    # Crawl the website; markdown output keeps the LLM prompt compact.
    crawl_status = app.crawl_url(
        url,
        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
        poll_interval=5
    )
    # Separate name for the page list — don't reuse `crawl_status` for it.
    pages = crawl_status['data']

    # Process each page's markdown individually.
    combined_results = []
    for page in pages:
        if 'markdown' not in page:
            continue
        content = page['markdown']
        # Ask the model to extract a JSON object for this page.
        # (Plain string: there are no placeholders, so no f-prefix.)
        analysis = generate_completion(
            "website data extractor",
            "Analyze the following website content and extract a JSON object "
            "based on the objective. Do not write the ```json and ``` to "
            "denote a JSON when returning a response",
            "Objective: " + objective + "\nContent: " + content
        )
        # Keep only analyses that parse as JSON; log and skip the rest.
        try:
            combined_results.append(json.loads(analysis))
        except json.JSONDecodeError:
            print(f"Could not parse JSON from analysis: {analysis}")

    # Combine the per-page results under the original objective.
    return {"objective": objective, "results": combined_results}
def generate_completion(role, task, content):
    """Generate a completion using OpenAI.

    The model is primed to act as *role* performing *task*; *content* is sent
    as the user message. Returns the text of the first completion choice.
    """
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    system_prompt = f"You are a {role}. {task}"
    chat = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": content},
        ],
    )
    return chat.choices[0].message.content
def handoff_to_crawl_url():
    """Hand off the url to the crawl url agent.

    Returning an Agent from a Swarm tool function transfers the conversation
    to that agent.
    """
    return crawl_website_agent
# Entry-point agent: collects the URL and extraction objective from the user,
# then hands off to the crawl agent via its single tool function.
user_interface_agent = Agent(
    name="User Interface Agent",
    instructions=(
        "You are a user interface agent that handles all interactions with "
        "the user. You need to always start by asking for a URL to crawl and "
        "the web data extraction objective. Be concise."
    ),
    functions=[handoff_to_crawl_url],
)
# Worker agent: crawls the requested pages and extracts data per the
# user's objective using its single tool function.
crawl_website_agent = Agent(
    name="Crawl Website Agent",
    instructions=(
        "You are a crawl URL agent specialized in crawling web pages and "
        "analyzing their content. When you are done, you must print the "
        "results to the console."
    ),
    functions=[crawl_and_analyze_url],
)
if __name__ == "__main__":
    # Start the interactive Swarm REPL with the user-facing agent in control.
    run_demo_loop(user_interface_agent, stream=True)