Merge pull request #784 from mendableai/nsc/geolocation
Some checks are pending
Deploy Images to GHCR / push-app-image (push) Waiting to run

Geolocation support for Firecrawl
This commit is contained in:
Nicolas 2024-10-15 21:49:16 -03:00 committed by GitHub
commit ffbf16048f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 2279 additions and 2 deletions

View File

@ -4,6 +4,7 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
import { countries } from "../../lib/validate-country";
export type Format =
| "markdown"
@ -108,6 +109,14 @@ export const scrapeOptions = z.object({
extract: extractOptions.optional(),
parsePDF: z.boolean().default(true),
actions: actionsSchema.optional(),
// Optional geolocation settings for a scrape request.
geolocation: z.object({
// `country` is optional. A missing or empty value passes validation; any other
// value must, once upper-cased, be one of the keys of `countries`.
country: z.string().optional().refine(
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
{
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
}
// After validation, normalize to upper case; a missing or empty value becomes 'US'.
).transform(val => val ? val.toUpperCase() : 'US')
}).optional(),
}).strict(strictMessage)
@ -421,6 +430,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.geolocation,
};
}

View File

@ -51,6 +51,9 @@ export type PageOptions = {
disableJsDom?: boolean; // beta
atsv?: boolean; // anti-bot solver, beta
actions?: Action[]; // beta
geolocation?: {
country?: string;
};
};
export type ExtractorOptions = {

File diff suppressed because it is too large Load Diff

View File

@ -593,6 +593,7 @@ export class WebScraperDataProvider {
disableJsDom: options.pageOptions?.disableJsDom ?? false,
atsv: options.pageOptions?.atsv ?? false,
actions: options.pageOptions?.actions ?? undefined,
geolocation: options.pageOptions?.geolocation ?? undefined,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =

View File

@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" } },
fireEngineOptions = {},
headers,
options,
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string } };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
@ -118,6 +118,7 @@ export async function scrapWithFireEngine({
...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
geolocation: pageOptions?.geolocation,
actions: actions,
},
{

View File

@ -156,6 +156,7 @@ export async function scrapSingleUrl(
disableJsDom: pageOptions.disableJsDom ?? false,
atsv: pageOptions.atsv ?? false,
actions: pageOptions.actions ?? undefined,
geolocation: pageOptions.geolocation ?? undefined,
}
if (extractorOptions) {