import csv
import json
import os
import uuid
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
from serpapi import GoogleSearch
from tqdm import tqdm
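
# Assumption: the .env file loaded below provides FIRECRAWL_API_KEY,
# OPENAI_API_KEY, and SERP_API_KEY -- the three keys read via os.getenv in this script.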
load_dotenv()

# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def search_google(query, objective):
    """Search Google using SerpAPI."""
    # print(f"Parameters: query={query}, objective={objective}")
    search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
    results = search.get_dict().get("organic_results", [])
    return {"objective": objective, "results": results}


def scrape_url(url, objective):
    """Scrape a website using Firecrawl."""
    # print(f"Parameters: url={url}, objective={objective}")
    scrape_status = app.scrape_url(
        url,
        params={'formats': ['markdown']}
    )
    return {"objective": objective, "results": scrape_status}


def crawl_url(url, objective):
    """Crawl a website using Firecrawl."""
    # print(f"Parameters: url={url}, objective={objective}")
    # If reusing a previously crawled URL set, pass its crawl ID in the call below instead:
    # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
    # scrape_status['results'] = scrape_status['data']
    scrape_status = app.crawl_url(
        url,
        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
    )
    return {"objective": objective, "results": scrape_status}


def analyze_website_content(content, objective):
    """Analyze the scraped website content using OpenAI."""
    # print(f"Parameters: content={content[:50]}..., objective={objective}")
    analysis = generate_completion(
        "website data extractor",
        "Analyze the following website content and extract a JSON object based on the objective. "
        "Do not wrap the response in ```json and ``` fences.",
        "Objective: " + objective + "\nContent: " + content
    )
    return {"objective": objective, "results": analysis}


def generate_completion(role, task, content):
    """Generate a completion using OpenAI."""
    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"You are a {role}. {task}"},
            {"role": "user", "content": content}
        ]
    )
    return response.choices[0].message.content


def read_websites_from_csv(file_path):
    """Read websites from a CSV file."""
    websites = []
    with open(file_path, mode='r') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            websites.append(row['website'])
    return websites
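
# Example websites.csv contents (assumed format; read_websites_from_csv above only
# requires a header row with a 'website' column):
#
#   website
#   https://example.com
#   https://example.org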


def write_results_to_json(results, file_path):
    """Write results to a JSON file."""
    with open(file_path, mode='w') as file:
        json.dump(results, file, indent=4)


def process_websites(file_path):
    """Process websites from a CSV file and write results to a new JSON file."""
    results = []
    websites = read_websites_from_csv(file_path)
    for website in websites:
        search_results = search_google(website, "Search website")
        if search_results['results']:
            top_result = search_results['results'][0]
            url = top_result['link']
            unique_filename = f'output_{uuid.uuid4()}.json'
            crawl_results = crawl_url(url, "Crawl website")
            if crawl_results['results']:
                for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
                    try:
                        result = json.loads(analysis_results['results'])
                        if result:
                            results.append(result)
                            # Write incrementally so partial results survive interruptions
                            write_results_to_json(results, unique_filename)
                    except json.JSONDecodeError:
                        # Skip model responses that are not valid JSON
                        continue


if __name__ == "__main__":
    # Process websites from the CSV file
    process_websites('websites.csv')
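
# Example run (assumes the API keys are set in .env, websites.csv sits in the
# working directory, and the script is saved under a name of your choosing):
#
#   python crawl_and_analyze_websites.py
#
# Each crawled site writes its extracted records to its own output_<uuid>.json file.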