// firecrawl/apps/playwright-service-ts/api.ts
//
// 228 lines
// 6.0 KiB
// TypeScript
// Raw Permalink Normal View History
//
// 2024-06-27 03:32:30 +08:00
import express, { Request, Response } from 'express';
import bodyParser from 'body-parser';
import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 'playwright';
import dotenv from 'dotenv';
import randomUseragent from 'random-useragent';
import { getError } from './helpers/get_error';
// Load environment configuration first: PORT, BLOCK_MEDIA and the PROXY_*
// settings are all read from process.env further down in this module.
dotenv.config();
// Express application that hosts the scrape endpoint.
const app = express();
// Port to listen on: the PORT environment variable when it is set to a
// non-empty value, otherwise 3003.
const port = process.env.PORT || 3003;
// 2024-06-27 03:32:30 +08:00
// Parse JSON request bodies so handlers can read them from req.body.
app.use(bodyParser.json());
// True only when the BLOCK_MEDIA environment variable equals "true"
// (compared case-insensitively); unset or empty falls back to 'False'.
const BLOCK_MEDIA = (process.env.BLOCK_MEDIA || 'False').toUpperCase() === 'TRUE';
// Optional proxy settings; each is null when the variable is unset or empty.
// initializeBrowser only sends credentials when both username and password are present.
const PROXY_SERVER = process.env.PROXY_SERVER || null;
const PROXY_USERNAME = process.env.PROXY_USERNAME || null;
const PROXY_PASSWORD = process.env.PROXY_PASSWORD || null;
// Ad and tracking domains. The request interceptor registered in
// initializeBrowser compares each request's hostname against these entries
// and aborts the requests that match.
const AD_SERVING_DOMAINS = [
'doubleclick.net',
'adservice.google.com',
'googlesyndication.com',
'googletagservices.com',
'googletagmanager.com',
'google-analytics.com',
'adsystem.com',
'adservice.com',
'adnxs.com',
'ads-twitter.com',
'facebook.net',
'fbcdn.net',
'amazon-adsystem.com'
];
/** Shape of the JSON body accepted by POST /scrape. */
interface UrlModel {
/** Page to navigate to; the handler rejects the request when it is missing or unparsable. */
url: string;
/** Extra delay passed to page.waitForTimeout after navigation; the handler defaults it to 0 (no wait). */
wait_after_load?: number;
/** Timeout in milliseconds for navigation and the selector wait; the handler defaults it to 15000. */
timeout?: number;
/** Extra HTTP headers applied to the page before navigating. */
headers?: { [key: string]: string };
/** Selector that must appear after load; if it does not, the scrape attempt fails. */
check_selector?: string;
}
// Module-level browser state shared by every request. Both are assigned by
// initializeBrowser and remain undefined until it has run, which is why the
// scrape handler and shutdownBrowser check them before use.
let browser: Browser;
let context: BrowserContext;
/**
 * Launches headless Chromium and creates the shared browser context used by
 * every scrape request.
 *
 * The context gets a random user agent, a fixed 1280x800 viewport, the proxy
 * configured through PROXY_SERVER / PROXY_USERNAME / PROXY_PASSWORD (if any),
 * and request interception that:
 *   - aborts image/audio/video requests when BLOCK_MEDIA is enabled, and
 *   - aborts requests to any host in AD_SERVING_DOMAINS.
 *
 * Assigns the module-level `browser` and `context` variables.
 */
const initializeBrowser = async () => {
  // True when `hostname` is `domain` itself or one of its subdomains. A plain
  // substring test would also block unrelated hosts that merely contain the
  // text (e.g. "roadsystem.com" contains "adsystem.com").
  const isHostWithinDomain = (hostname: string, domain: string): boolean =>
    hostname === domain || hostname.endsWith(`.${domain}`);

  browser = await chromium.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--no-first-run',
      '--no-zygote',
      '--single-process',
      '--disable-gpu'
    ]
  });

  const userAgent = randomUseragent.getRandom();
  const viewport = { width: 1280, height: 800 };

  // Kept loosely typed, as before, because the proxy entry is attached conditionally below.
  const contextOptions: any = {
    userAgent,
    viewport,
  };

  if (PROXY_SERVER && PROXY_USERNAME && PROXY_PASSWORD) {
    contextOptions.proxy = {
      server: PROXY_SERVER,
      username: PROXY_USERNAME,
      password: PROXY_PASSWORD,
    };
  } else if (PROXY_SERVER) {
    // Credentials are only sent when both username and password are configured.
    contextOptions.proxy = {
      server: PROXY_SERVER,
    };
  }

  context = await browser.newContext(contextOptions);

  if (BLOCK_MEDIA) {
    await context.route('**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}', async (route: Route) => {
      await route.abort();
    });
  }

  // Intercept all requests to avoid loading ads
  await context.route('**/*', (route: Route, request: PlaywrightRequest) => {
    const hostname = new URL(request.url()).hostname;
    if (AD_SERVING_DOMAINS.some(domain => isHostWithinDomain(hostname, domain))) {
      console.log(hostname);
      return route.abort();
    }
    // Matching handlers run in reverse registration order, so this catch-all
    // runs before the media handler above. route.continue() would send the
    // request straight to the network and the media handler would never run;
    // route.fallback() hands the request to the remaining handlers first.
    return route.fallback();
  });
};
/**
 * Closes the shared browser context and then the browser itself.
 * Either step is skipped when the corresponding object was never created.
 */
const shutdownBrowser = async () => {
  // Context first, then the browser that owns it.
  await context?.close();
  await browser?.close();
};
/**
 * Reports whether `urlString` can be parsed as an absolute URL.
 *
 * @param urlString - Candidate URL text.
 * @returns `true` when the `URL` constructor accepts the string, `false` when it throws.
 */
const isValidUrl = (urlString: string): boolean => {
  let parsed = false;
  try {
    parsed = Boolean(new URL(urlString));
  } catch {
    // The URL constructor throws for anything it cannot parse; keep `parsed` false.
  }
  return parsed;
};
/**
 * Navigates `page` to `url` and returns the rendered HTML with the HTTP status.
 *
 * @param page - Page object to drive (goto / waitForTimeout / waitForSelector / content are called on it).
 * @param url - Address to navigate to.
 * @param waitUntil - Navigation event to wait for: 'load' or 'networkidle'.
 * @param waitAfterLoad - Extra delay after navigation; skipped when not greater than 0.
 * @param timeout - Timeout in milliseconds for both navigation and the selector wait.
 * @param checkSelector - Optional selector that must appear before the content is read.
 * @returns `{ content, status }`, where `status` is null if navigation produced no response.
 * @throws Error('Required selector not found') when `checkSelector` does not appear;
 *         navigation errors propagate unchanged.
 */
const scrapePage = async (page: any, url: string, waitUntil: 'load' | 'networkidle', waitAfterLoad: number, timeout: number, checkSelector: string | undefined) => {
  console.log(`Navigating to ${url} with waitUntil: ${waitUntil} and timeout: ${timeout}ms`);
  const navigation = await page.goto(url, { waitUntil, timeout });

  const shouldPause = waitAfterLoad > 0;
  if (shouldPause) {
    await page.waitForTimeout(waitAfterLoad);
  }

  if (checkSelector) {
    try {
      await page.waitForSelector(checkSelector, { timeout });
    } catch {
      // Replace the underlying wait failure with a stable, caller-facing message.
      throw new Error('Required selector not found');
    }
  }

  // Read the HTML before inspecting the response, same order as the object literal it replaces.
  const content = await page.content();
  let status = null;
  if (navigation) {
    status = navigation.status();
  }
  return { content, status };
};
/**
 * POST /scrape — loads a page in the shared browser context and returns its HTML.
 *
 * Request body: see UrlModel (`url` is required; `wait_after_load` defaults to 0
 * and `timeout` to 15000).
 *
 * Responses:
 *   - 400 `{ error }` when `url` is missing or cannot be parsed.
 *   - 500 `{ error }` when the browser cannot be prepared or both load strategies fail.
 *   - 200 `{ content, pageStatusCode, pageError }` otherwise; `pageError` is `false`
 *     when the page answered 200, else whatever getError returns for the status.
 *
 * Load strategies: first wait for the 'load' event; if that attempt throws,
 * retry the navigation waiting for 'networkidle'.
 */
app.post('/scrape', async (req: Request, res: Response) => {
  const { url, wait_after_load = 0, timeout = 15000, headers, check_selector }: UrlModel = req.body;

  console.log(`================= Scrape Request =================`);
  console.log(`URL: ${url}`);
  console.log(`Wait After Load: ${wait_after_load}`);
  console.log(`Timeout: ${timeout}`);
  console.log(`Headers: ${headers ? JSON.stringify(headers) : 'None'}`);
  console.log(`Check Selector: ${check_selector ? check_selector : 'None'}`);
  console.log(`==================================================`);

  if (!url) {
    return res.status(400).json({ error: 'URL is required' });
  }

  if (!isValidUrl(url)) {
    return res.status(400).json({ error: 'Invalid URL' });
  }

  if (!PROXY_SERVER) {
    console.warn('⚠️ WARNING: No proxy server provided. Your IP address may be blocked.');
  }

  let page: Awaited<ReturnType<BrowserContext['newPage']>> | undefined;
  try {
    // Browser setup, page creation and header assignment are inside the try:
    // Express 4 does not catch rejections from async handlers, so a failure
    // here would otherwise leave the request without a response.
    if (!browser || !context) {
      await initializeBrowser();
    }

    page = await context.newPage();

    // Set headers if provided
    if (headers) {
      await page.setExtraHTTPHeaders(headers);
    }

    let result: Awaited<ReturnType<typeof scrapePage>>;
    try {
      // Strategy 1: Normal
      console.log('Attempting strategy 1: Normal load');
      result = await scrapePage(page, url, 'load', wait_after_load, timeout, check_selector);
    } catch (error) {
      console.log('Strategy 1 failed, attempting strategy 2: Wait until networkidle');
      // Strategy 2: Wait until networkidle. A failure here reaches the outer catch.
      result = await scrapePage(page, url, 'networkidle', wait_after_load, timeout, check_selector);
    }

    const pageContent = result.content;
    const pageStatusCode: number | null = result.status;

    const pageError = pageStatusCode !== 200 ? getError(pageStatusCode) : false;

    if (!pageError) {
      console.log(`✅ Scrape successful!`);
    } else {
      console.log(`🚨 Scrape failed with status code: ${pageStatusCode} ${pageError}`);
    }

    return res.json({
      content: pageContent,
      pageStatusCode,
      pageError
    });
  } catch (error) {
    // Record the cause; the client still gets the same generic message as before.
    console.error('Scrape request failed:', error);
    return res.status(500).json({ error: 'An error occurred while fetching the page.' });
  } finally {
    // Close the page on every path so failed requests do not leave pages open.
    if (page) {
      await page.close().catch((closeError: unknown) => {
        console.error('Failed to close page:', closeError);
      });
    }
  }
});
// Start the HTTP server, then prepare the shared browser before announcing readiness.
app.listen(port, () => {
  initializeBrowser()
    .then(() => {
      console.log(`Server is running on port ${port}`);
    })
    .catch((error: unknown) => {
      // Without a handler this is an unhandled rejection. Log the cause and
      // exit non-zero so a supervisor can restart the service.
      console.error('Failed to initialize browser:', error);
      process.exit(1);
    });
});
// On Ctrl+C (SIGINT), close the browser before exiting.
process.on('SIGINT', () => {
  shutdownBrowser()
    .then(() => {
      console.log('Browser closed');
      process.exit(0);
    })
    .catch((error: unknown) => {
      // A failed close must still end the process; report it and exit non-zero.
      console.error('Failed to close browser cleanly:', error);
      process.exit(1);
    });
});