firecrawl/examples/scrape_and_analyze_airbnb_data_e2b/scraping.ts
2024-06-21 15:40:46 -04:00

99 lines
2.7 KiB
TypeScript

//@ts-ignore
import * as fs from 'fs'
import FirecrawlApp from '@mendable/firecrawl-js'
import 'dotenv/config'
import { config } from 'dotenv'
import { z } from 'zod'
// Load variables from a local .env file into process.env so that
// FIRECRAWL_API_KEY is available to scrapeAirbnb().
// NOTE(review): the side-effect import `import 'dotenv/config'` above presumably
// already performs this load, which would make this call redundant — confirm
// before removing either one.
config()
/**
 * Scrapes Airbnb's San Francisco search results through Firecrawl and saves
 * the extracted listings to disk.
 *
 * Steps:
 *  1. Scrape the first results page with an extraction schema that asks for
 *     the pagination links.
 *  2. Resolve those links against the Airbnb origin and de-duplicate them;
 *     if none are found, fall back to the first results page only.
 *  3. Scrape every page in parallel with a schema describing the listings.
 *  4. Write the combined listings to `airbnb_listings.json` (relative to the
 *     current working directory) and return the same JSON text.
 *
 * Reads the Firecrawl API key from `process.env.FIRECRAWL_API_KEY`.
 *
 * @returns The pretty-printed JSON array of listings, or `undefined` when any
 *   step throws (the error message is logged, not rethrown).
 */
export async function scrapeAirbnb(): Promise<string | undefined> {
  const listingsUrl =
    'https://www.airbnb.com/s/San-Francisco--CA--United-States/homes'
  const baseUrl = 'https://www.airbnb.com'
  const outputPath = 'airbnb_listings.json'

  try {
    // Initialize the FirecrawlApp with your API key
    const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY })

    // Schema used to extract the pagination links
    const paginationSchema = z.object({
      page_links: z
        .array(
          z.object({
            link: z.string(),
          })
        )
        .describe('Pagination links in the bottom of the page.'),
    })
    const paginationParams = {
      pageOptions: {
        onlyMainContent: false,
      },
      extractorOptions: { extractionSchema: paginationSchema },
      timeout: 50000, // if needed, sometimes airbnb stalls...
    }

    // Scrape the first page to discover the pagination links
    const linksData = await app.scrapeUrl(listingsUrl, paginationParams)
    const paginationExtraction = linksData.data?.['llm_extraction']
    console.log(paginationExtraction)

    // The extraction may be missing or malformed; treat anything that is not
    // an array as "no links found" so the fallback below can kick in.
    const rawLinks: unknown = paginationExtraction?.page_links
    const linkEntries = Array.isArray(rawLinks) ? rawLinks : []

    // Resolve a single extracted link to an absolute URL. `new URL` handles
    // relative paths as well as links that are already absolute, which plain
    // string concatenation with the origin would corrupt.
    const resolveLink = (href: unknown): string[] => {
      if (typeof href !== 'string' || href.length === 0) return []
      try {
        return [new URL(href, baseUrl).href]
      } catch {
        // Unparseable link: skip it rather than abort the whole run.
        return []
      }
    }

    // De-duplicate so the same results page is never scraped twice.
    const discoveredUrls = [
      ...new Set(linkEntries.flatMap((entry) => resolveLink(entry?.link))),
    ]

    // Just in case we were not able to get any pagination links
    const pageUrls = discoveredUrls.length > 0 ? discoveredUrls : [listingsUrl]

    // Schema used to extract the listings
    const listingsSchema = z.object({
      listings: z
        .array(
          z.object({
            title: z.string(),
            price_per_night: z.number(),
            location: z.string(),
            rating: z.number().optional(),
            reviews: z.number().optional(),
          })
        )
        .describe('Airbnb listings in San Francisco'),
    })
    const listingsParams = {
      pageOptions: {
        onlyMainContent: false,
      },
      extractorOptions: { extractionSchema: listingsSchema },
    }

    // Scrape a single results page; a page without extracted listings
    // contributes an empty array instead of an `undefined` entry.
    const scrapeListings = async (url: string): Promise<unknown[]> => {
      const result = await app.scrapeUrl(url, listingsParams)
      const listings: unknown = result.data?.['llm_extraction']?.listings
      return Array.isArray(listings) ? listings : []
    }

    // Scrape all result pages in parallel and flatten into one list
    const listingsPerPage = await Promise.all(
      pageUrls.map((url) => scrapeListings(url))
    )
    const allListings = listingsPerPage.flat()

    // Save the listings to a file and return the exact text that was written
    // (no need to read the file back).
    const listingsData = JSON.stringify(allListings, null, 2)
    fs.writeFileSync(outputPath, listingsData)
    return listingsData
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error)
    console.error('An error occurred:', message)
    return undefined
  }
}