feat(scrapeURL): add url-specific parameters
Some checks failed
STAGING Deploy Images to GHCR / push-app-image (push) Has been cancelled

This commit is contained in:
Gergő Móricz 2024-11-06 00:03:35 +01:00
parent e5385e62ee
commit 5e2124c6f9
2 changed files with 85 additions and 0 deletions

View File

@ -8,6 +8,7 @@ import { parseMarkdown } from "../../lib/html-to-markdown";
import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";
export type ScrapeUrlResponse = ({
success: true,
@ -74,6 +75,12 @@ function buildFeatureFlags(url: string, options: ScrapeOptions, internalOptions:
}
function buildMetaObject(id: string, url: string, options: ScrapeOptions, internalOptions: InternalOptions): Meta {
const specParams = urlSpecificParams[new URL(url).hostname.replace(/^www\./, "")];
if (specParams !== undefined) {
options = Object.assign(options, specParams.scrapeOptions);
internalOptions = Object.assign(internalOptions, specParams.internalOptions);
}
const _logger = logger.child({ module: "ScrapeURL", scrapeId: id });
const logs: any[] = [];
_logger.add(new ArrayTransport({ array: logs, scrapeId: id }));

View File

@ -0,0 +1,78 @@
import { InternalOptions } from "..";
import { ScrapeOptions } from "../../../controllers/v1/types";
/**
 * Overrides applied to a scrape when the target URL matches a known host.
 *
 * Both members are partial: only the fields present in an entry are meant
 * to override the corresponding request-level values.
 */
export type UrlSpecificParams = {
  /** Subset of the public scrape options to override for the host. */
  scrapeOptions: Partial<ScrapeOptions>;
  /** Subset of the internal scraper options to override for the host. */
  internalOptions: Partial<InternalOptions>;
};
/**
 * Request headers sent for the documentation-style hosts that share
 * `docsParam`: a desktop Chrome User-Agent plus matching fetch-metadata,
 * referer and accept headers.
 */
const docsRequestHeaders = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "sec-fetch-site": "same-origin",
  "sec-fetch-mode": "cors",
  "sec-fetch-dest": "empty",
  "referer": "https://www.google.com/",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",
  "accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
};

/**
 * Shared preset reused by several hosts in the per-host table: sets
 * `waitFor` to 2000, sends the headers above, and forces the
 * "fire-engine;chrome-cdp" engine.
 */
const docsParam: UrlSpecificParams = {
  scrapeOptions: {
    waitFor: 2000,
    headers: docsRequestHeaders,
  },
  internalOptions: {
    forceEngine: "fire-engine;chrome-cdp",
  },
};
/**
 * Literal per-host entries. Keys are hostnames; values are the overrides to
 * apply when scraping that host. Kept as a typed literal so every entry is
 * checked against `UrlSpecificParams`.
 */
const urlSpecificParamEntries: Record<string, UrlSpecificParams> = {
  "support.greenpay.me": docsParam,
  "docs.pdw.co": docsParam,
  "developers.notion.com": docsParam,
  "docs2.hubitat.com": docsParam,
  "rsseau.fr": docsParam,
  "help.salesforce.com": docsParam,
  "scrapethissite.com": {
    scrapeOptions: {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        referer: "https://www.google.com/",
        "accept-language": "en-US,en;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      },
    },
    internalOptions: { forceEngine: "fetch" },
  },
  // Disabled entry. It uses a different shape (`defaultScraper` / `params`)
  // than `UrlSpecificParams`, so it cannot be re-enabled as-is.
  // "eonhealth.com": {
  //   defaultScraper: "fire-engine",
  //   params: {
  //     fireEngineOptions: {
  //       mobileProxy: true,
  //       method: "get",
  //       engine: "request",
  //     },
  //   },
  // },
  "notion.com": {
    scrapeOptions: { waitFor: 2000 },
    internalOptions: { forceEngine: "fire-engine;playwright" },
  },
  "developer.apple.com": {
    scrapeOptions: { waitFor: 2000 },
    internalOptions: { forceEngine: "fire-engine;playwright" },
  },
  "digikey.com": {
    scrapeOptions: {},
    internalOptions: { forceEngine: "fire-engine;tlsclient" },
  },
  "lorealparis.hu": {
    scrapeOptions: {},
    internalOptions: { forceEngine: "fire-engine;tlsclient" },
  },
};

/**
 * Per-host scrape overrides, looked up by hostname.
 *
 * The table is copied onto a null-prototype object because it is indexed
 * with a hostname taken from the URL being scraped. On an ordinary object
 * literal, hostnames such as "constructor", "toString" or "__proto__" would
 * resolve to inherited `Object.prototype` members instead of `undefined`,
 * so a `!== undefined` check on the lookup result would wrongly treat them
 * as configured hosts. With no prototype, unknown hosts always yield
 * `undefined`.
 *
 * Note: as a consequence the object has no inherited methods
 * (`hasOwnProperty`, `toString`, ...); use `Object.keys` / `in` / direct
 * indexing to inspect it.
 */
export const urlSpecificParams: Record<string, UrlSpecificParams> = Object.assign(
  Object.create(null),
  urlSpecificParamEntries
);