// Mirror of https://github.com/mendableai/firecrawl.git
// (synced 2024-11-16 03:32:22 +08:00)
// 99 lines, 2.7 KiB, TypeScript
//@ts-ignore
|
|
import * as fs from 'fs'
|
|
import FirecrawlApp from '@mendable/firecrawl-js'
|
|
import 'dotenv/config'
|
|
import { config } from 'dotenv'
|
|
import { z } from 'zod'
|
|
|
|
// Run dotenv's config() at module load, before scrapeAirbnb reads
// FIRECRAWL_API_KEY from process.env.
// NOTE(review): 'dotenv/config' is also imported for its side effect above;
// presumably that already performs the same loading — confirm whether this
// explicit call is redundant.
config()
|
|
|
|
/**
 * Scrape Airbnb's San Francisco search results through Firecrawl and save
 * them as JSON.
 *
 * Steps:
 *  1. Scrape the first results page and extract its pagination links.
 *  2. Scrape every (de-duplicated, absolute) pagination link in parallel and
 *     extract the listings found on each page.
 *  3. Write the combined listings to `airbnb_listings.json` (relative to the
 *     current working directory).
 *
 * Failures are not propagated: any error is logged with `console.error` and
 * the function then resolves to `undefined`.
 *
 * @returns {Promise<string | undefined>} The pretty-printed JSON text that was
 *   written to `airbnb_listings.json`, or `undefined` if a step failed.
 */
export async function scrapeAirbnb() {
  try {
    // Initialize the FirecrawlApp with your API key
    const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY })

    // Entry page of the search, and the origin used to resolve relative links
    const listingsUrl =
      'https://www.airbnb.com/s/San-Francisco--CA--United-States/homes'
    const baseUrl = 'https://www.airbnb.com'

    // Resolve an extracted link against baseUrl. Relative links become
    // absolute and already-absolute links are kept as they are (plain string
    // concatenation would corrupt them). Unusable values yield null.
    const toAbsoluteUrl = (href) => {
      if (typeof href !== 'string' || href.trim() === '') {
        return null
      }
      try {
        return new URL(href.trim(), baseUrl).href
      } catch {
        return null
      }
    }

    // Define schema to extract pagination links
    const paginationSchema = z.object({
      page_links: z
        .array(
          z.object({
            link: z.string(),
          })
        )
        .describe('Pagination links in the bottom of the page.'),
    })

    const paginationParams = {
      pageOptions: {
        onlyMainContent: false,
      },
      extractorOptions: { extractionSchema: paginationSchema },
      timeout: 50000, // if needed, sometimes airbnb stalls...
    }

    // Start crawling to get pagination links
    const linksData = await app.scrapeUrl(listingsUrl, paginationParams)
    const paginationExtraction = linksData?.data?.['llm_extraction']
    console.log(paginationExtraction)

    // A missing or malformed extraction must reach the fallback below instead
    // of throwing, so treat it as "no links found".
    const extractedLinks = Array.isArray(paginationExtraction?.page_links)
      ? paginationExtraction.page_links
      : []
    const absoluteLinks = extractedLinks
      .map((entry) => toAbsoluteUrl(entry?.link))
      .filter((url) => url !== null)

    // The same page can be linked more than once; scrape each page only once.
    let paginationLinks = [...new Set(absoluteLinks)]

    // Just in case it is not able to get the pagination links
    if (paginationLinks.length === 0) {
      paginationLinks = [listingsUrl]
    }

    // Define schema to extract listings
    const listingsSchema = z.object({
      listings: z
        .array(
          z.object({
            title: z.string(),
            price_per_night: z.number(),
            location: z.string(),
            rating: z.number().optional(),
            reviews: z.number().optional(),
          })
        )
        .describe('Airbnb listings in San Francisco'),
    })

    const listingsParams = {
      pageOptions: {
        onlyMainContent: false,
      },
      extractorOptions: { extractionSchema: listingsSchema },
    }

    // Scrape a single results page. A page whose extraction produced no
    // listings contributes an empty array rather than aborting the whole run.
    const scrapeListings = async (url) => {
      const result = await app.scrapeUrl(url, listingsParams)
      const listings = result?.data?.['llm_extraction']?.listings
      return Array.isArray(listings) ? listings : []
    }

    // Scrape all pagination links in parallel
    const listingsResults = await Promise.all(
      paginationLinks.map((link) => scrapeListings(link))
    )

    // Flatten the per-page arrays into one list
    const allListings = listingsResults.flat()

    // Save the listings to a file and return the exact text that was written
    // (reading the file back would only reproduce this same string).
    const serializedListings = JSON.stringify(allListings, null, 2)
    fs.writeFileSync('airbnb_listings.json', serializedListings)
    return serializedListings
  } catch (error) {
    // Fall back to the thrown value itself when it is not an Error object.
    console.error('An error occurred:', error?.message ?? error)
  }
}
|