Create app.py

2024-11-15 19:22:19 +08:00 · 2024-10-19 00:51:18 +05:30 · 2024-10-19 00:51:18 +05:30 · 10381b5d3c
commit 10381b5d3c
parent 6ebfcc85cf
1 changed file with 99 additions and 0 deletions
--- a/examples/sales_web_crawler/app.py
+++ b/examples/sales_web_crawler/app.py
@@ -0,0 +1,99 @@
+import os
+import csv
+import json
+
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+from openai import OpenAI
+from serpapi import GoogleSearch
+
+# Load settings from a local .env file (if any) before the API keys are read.
+load_dotenv()
+
+# Shared Firecrawl and OpenAI clients used by the functions in this module.
+app = FirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+def search_google(query, objective):
+    """Run a Google search through SerpAPI and return its organic results.
+
+    Args:
+        query: Search string sent as the ``q`` parameter.
+        objective: Free-form label echoed back in the returned dict.
+
+    Returns:
+        A dict with "objective" and "results" keys; "results" is the
+        ``organic_results`` entry of the response, or an empty list when the
+        response has no such entry.
+    """
+    print(f"Parameters: query={query}, objective={objective}")
+    request = {"q": query, "api_key": os.getenv("SERP_API_KEY")}
+    payload = GoogleSearch(request).get_dict()
+    organic = payload.get("organic_results", [])
+    return {"objective": objective, "results": organic}
+
+def scrape_url(url, objective):
+    """Scrape a single page with Firecrawl, requesting Markdown output.
+
+    Args:
+        url: Address of the page to scrape.
+        objective: Free-form label echoed back in the returned dict.
+
+    Returns:
+        A dict with "objective" and "results" keys; "results" is whatever
+        ``app.scrape_url`` returned.
+    """
+    print(f"Parameters: url={url}, objective={objective}")
+    scrape_options = {'formats': ['markdown']}
+    page = app.scrape_url(url, params=scrape_options)
+    return {"objective": objective, "results": page}
+
+def crawl_url(url, objective):
+    """Crawl a site with Firecrawl, limited to 10 pages of Markdown output.
+
+    Args:
+        url: Address where the crawl starts.
+        objective: Free-form label echoed back in the returned dict.
+
+    Returns:
+        A dict with "objective" and "results" keys; "results" is whatever
+        ``app.crawl_url`` returned.
+    """
+    print(f"Parameters: url={url}, objective={objective}")
+    # To reuse an already crawled URL set instead of crawling again, fetch it
+    # by ID and expose its data under 'results', e.g.:
+    # crawl = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
+    # crawl['results'] = crawl['data']
+    crawl_options = {
+        'limit': 10,
+        'scrapeOptions': {'formats': ['markdown']},
+    }
+    crawl = app.crawl_url(url, params=crawl_options)
+    return {"objective": objective, "results": crawl}
+
+def analyze_website_content(content, objective):
+    """Ask OpenAI to extract a JSON object from scraped page content.
+
+    Args:
+        content: Text of the scraped page.
+        objective: Description of what to extract; it is sent to the model
+            and echoed back in the returned dict.
+
+    Returns:
+        A dict with "objective" and "results" keys; "results" is the text
+        returned by ``generate_completion``.
+    """
+    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    extractor_role = "website data extractor"
+    # The model is asked for bare JSON, without a Markdown code fence.
+    instructions = (
+        "Analyze the following website content and extract a JSON object "
+        "based on the objective. Do not write the ```json and ``` to denote "
+        "a JSON when returning a response"
+    )
+    prompt = "Objective: " + objective + "\nContent: " + content
+    extracted = generate_completion(extractor_role, instructions, prompt)
+    return {"objective": objective, "results": extracted}
+
+def generate_completion(role, task, content):
+    """Send one system + user message pair to the gpt-4o chat model.
+
+    Args:
+        role: Persona inserted into the system message ("You are a ...").
+        task: Instructions appended to the system message.
+        content: Text sent as the user message.
+
+    Returns:
+        The message content of the first choice in the response.
+    """
+    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    system_message = {"role": "system", "content": f"You are a {role}. {task}"}
+    user_message = {"role": "user", "content": content}
+    completion = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[system_message, user_message],
+    )
+    first_choice = completion.choices[0]
+    return first_choice.message.content
+
+def read_websites_from_csv(file_path):
+    """Read the ``website`` column of a CSV file.
+
+    Args:
+        file_path: Path to a CSV file whose header row has a ``website``
+            column.
+
+    Returns:
+        A list of the non-empty ``website`` values, in file order, with
+        surrounding whitespace removed. Rows whose ``website`` cell is blank
+        or missing are skipped.
+
+    Raises:
+        KeyError: If a data row is read and the header has no ``website``
+            column.
+    """
+    websites = []
+    # newline='' is what the csv module requires for correct quoted-newline
+    # handling; utf-8-sig drops a leading BOM so it cannot end up inside the
+    # first header name.
+    with open(file_path, mode='r', newline='', encoding='utf-8-sig') as file:
+        for row in csv.DictReader(file):
+            # DictReader fills cells missing from short rows with None.
+            website = (row['website'] or '').strip()
+            if website:
+                websites.append(website)
+    return websites
+
+def write_results_to_json(results, file_path):
+    """Serialize ``results`` as JSON with 4-space indentation.
+
+    Args:
+        results: JSON-serializable data to store.
+        file_path: Destination path; the file is opened in write mode, so any
+            existing content is replaced.
+    """
+    with open(file_path, 'w') as out:
+        json.dump(results, out, indent=4)
+
+def process_websites(file_path, output_path='enriched_data.json'):
+    """Search, crawl and analyze each website in a CSV, then save the results.
+
+    For every website read from ``file_path`` this searches Google, crawls
+    the link of the top organic result, and asks OpenAI to extract emails,
+    names and titles from the first two crawled pages. Each analysis that
+    parses as JSON is collected, and the collection is written to
+    ``output_path``.
+
+    Args:
+        file_path: CSV file with a ``website`` column.
+        output_path: JSON file to write; defaults to ``enriched_data.json``.
+    """
+    results = []
+    for website in read_websites_from_csv(file_path):
+        search_results = search_google(website, "Search website")
+        if not search_results['results']:
+            continue
+        url = search_results['results'][0].get('link')
+        if not url:
+            # Top result carries no link, so there is nothing to crawl.
+            continue
+        crawl_results = crawl_url(url, "Crawl website")
+        if not crawl_results['results']:
+            continue
+        pages = crawl_results['results'].get('data') or []
+        for each_result in pages[:2]:
+            markdown = each_result.get('markdown')
+            if not markdown:
+                # Page came back without Markdown content; nothing to analyze.
+                continue
+            analysis_results = analyze_website_content(markdown, "Extract emails, names, and titles of the people found.")
+            print(analysis_results['results'])
+            try:
+                results.append(json.loads(analysis_results['results']))
+            except (TypeError, ValueError) as error:
+                # The model may answer with non-JSON text or no text at all;
+                # skip that page instead of losing everything collected so far.
+                print(f"Skipping analysis for {url}: not valid JSON ({error})")
+    write_results_to_json(results, output_path)
+
+if __name__ == "__main__":
+    # Script entry point: enrich the sites listed in websites.csv
+    process_websites(file_path='websites.csv')