This commit is contained in:
Nicolas 2024-07-12 22:02:08 -04:00
parent bfc7f5882e
commit e098e88ea7
4 changed files with 13 additions and 3 deletions

View File

@ -129,3 +129,11 @@ export interface FireEngineResponse {
pageError?: string;
}
/**
 * Optional per-request settings that `scrapWithFireEngine` accepts through
 * its `fireEngineOptions` argument.
 *
 * Every field may be omitted. The scraper defaults the whole object to `{}`
 * and spreads whatever is present into the object it sends, alongside
 * `screenshot`, `headers` and `pageOptions`.
 *
 * NOTE(review): the values are interpreted by the fire-engine service, not
 * by this code base — the per-field meanings below that are marked
 * "presumably" are inferred from the names only; confirm against the service.
 */
export interface FireEngineOptions {
  /** Engine identifier; the sitemap fetcher passes `"request"`. */
  engine?: string;

  /** Method name; the sitemap fetcher passes `"get"` (presumably the HTTP verb — confirm). */
  method?: string;

  /** The sitemap fetcher passes `true` (presumably routes through a mobile proxy — confirm). */
  mobileProxy?: boolean;

  /** Presumably asks the engine not to load media resources — confirm. */
  blockMedia?: boolean;

  /** Presumably asks the engine to block advertisements — confirm. */
  blockAds?: boolean;
}

View File

@ -8,7 +8,6 @@ import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
export class WebCrawler {
private initialUrl: string;

View File

@ -1,5 +1,5 @@
import axios from "axios";
import { FireEngineResponse } from "../../../lib/entities";
import { FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
@ -20,6 +20,7 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
pageOptions = { parsePDF: true },
fireEngineOptions = {},
headers,
options,
}: {
@ -27,6 +28,7 @@ export async function scrapWithFireEngine({
waitFor?: number;
screenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
}): Promise<FireEngineResponse> {
@ -57,6 +59,7 @@ export async function scrapWithFireEngine({
screenshot: screenshotParam,
headers: headers,
pageOptions: pageOptions,
...fireEngineOptions,
},
{
headers: {

View File

@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl });
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
content = response.html;
}
} catch (error) {