firecrawl/examples/crm_lead_enrichment/crm_lead_enrichment.py
2024-10-09 17:29:39 -04:00

137 lines
4.6 KiB
Python

import json
import os
from dotenv import load_dotenv
from openai import OpenAI
from hubspot import HubSpot
from firecrawl import FirecrawlApp
# Load environment variables
load_dotenv()
# Initialize clients
def initialize_clients():
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
hubspot_api_key = os.getenv("HUBSPOT_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)
hubspot_client = HubSpot(access_token=hubspot_api_key)
firecrawl_client = FirecrawlApp(api_key=firecrawl_api_key)
return openai_client, hubspot_client, firecrawl_client
# Get list of companies from HubSpot
def get_companies_from_hubspot(hubspot_client):
companies = []
after = None
while True:
try:
response = hubspot_client.crm.companies.basic_api.get_page(
limit=100,
properties=["name", "website"],
after=after
)
companies.extend(response.results)
if not response.paging:
break
after = response.paging.next.after
except Exception as e:
print(f"Error fetching companies from HubSpot: {str(e)}")
break
return [company for company in companies if company.properties.get("website")]
# Scrape URL using Firecrawl
def scrape_url(firecrawl_client, url):
try:
return firecrawl_client.scrape_url(url, params={'formats': ['markdown']})
except Exception as e:
print(f"Error scraping URL {url}: {str(e)}")
return None
# Extract information using OpenAI
def extract_info(openai_client, content):
prompt = f"""
Based on the markdown content, extract the following information in JSON format:
{{
"is_open_source": boolean,
"value_proposition": "string",
"main_product": "string",
"potential_scraping_use": "string"
}}
Are they open source?
What is their value proposition?
What are their main products?
How could they use a web scraping service in one one their products?
Markdown content:
{content}
Respond only with the JSON object, ensuring all fields are present even if the information is not found (use null in that case). Do not include the markdown code snippet like ```json or ``` at all in the response.
"""
try:
completion = openai_client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}]
)
return json.loads(completion.choices[0].message.content)
except Exception as e:
print(f"Error extracting information: {str(e)}")
print(completion.choices[0].message.content)
return None
# Update company properties in HubSpot
def update_hubspot(hubspot_client, company, extracted_info):
try:
hubspot_client.crm.companies.basic_api.update(
company_id=company.id,
simple_public_object_input={
"properties": {
"is_open_source": str(extracted_info["is_open_source"]).lower(),
"value_prop": extracted_info["value_proposition"],
"main_products_offered": extracted_info["main_product"],
"how_they_can_use_scraping": extracted_info["potential_scraping_use"]
}
}
)
print(f"Successfully updated HubSpot for company {company.properties['name']}")
except Exception as e:
print(f"Error updating HubSpot for company {company.properties['name']}: {str(e)}")
# Main process
def main():
openai_client, hubspot_client, firecrawl_client = initialize_clients()
companies = get_companies_from_hubspot(hubspot_client)
scraped_data = []
for company in companies:
company_name = company.properties.get("name", "Unknown")
url = company.properties["website"]
print(f"Processing {company_name} at {url}...")
scrape_status = scrape_url(firecrawl_client, url)
if not scrape_status:
continue
extracted_info = extract_info(openai_client, scrape_status["content"])
if not extracted_info:
continue
update_hubspot(hubspot_client, company, extracted_info)
scraped_data.append({
"company": company_name,
"url": url,
"markdown": scrape_status["content"],
"extracted_info": extracted_info
})
print(f"Successfully processed {company_name}")
print(json.dumps(extracted_info, indent=2))
print(f"Scraped, analyzed, and updated {len(scraped_data)} companies")
if __name__ == "__main__":
main()