Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 03:32:22 +08:00)

Merge branch 'mendableai:main' into feat/add-go-sdk
Commit 1fda882983
@@ -84,6 +84,11 @@
  "description": "Include a screenshot of the top of the page that you are scraping.",
  "default": false
  },
+ "fullPageScreenshot": {
+   "type": "boolean",
+   "description": "Include a full page screenshot of the page that you are scraping.",
+   "default": false
+ },
  "waitFor": {
  "type": "integer",
  "description": "Wait x amount of milliseconds for the page to load to fetch content",
@@ -317,6 +322,11 @@
  "description": "Include a screenshot of the top of the page that you are scraping.",
  "default": false
  },
+ "fullPageScreenshot": {
+   "type": "boolean",
+   "description": "Include a full page screenshot of the page that you are scraping.",
+   "default": false
+ },
  "waitFor": {
  "type": "integer",
  "description": "Wait x amount of milliseconds for the page to load to fetch content",
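Taken together, the two spec hunks above add a fullPageScreenshot page option next to the existing screenshot flag. A minimal sketch of how a client could exercise it against the v0 scrape endpoint (the API key and target URL are placeholders; field names follow the schema above):

// sketch: request a full-page screenshot via POST /v0/scrape
async function scrapeWithFullPageScreenshot() {
  const res = await fetch("https://api.firecrawl.dev/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      url: "https://example.com",
      pageOptions: {
        fullPageScreenshot: true, // full page, instead of `screenshot` (top of page only)
        waitFor: 1000,            // milliseconds to wait before fetching content
      },
    }),
  });
  return res.json();
}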
@@ -7,6 +7,7 @@ export const defaultPageOptions = {
  includeHtml: false,
  waitFor: 0,
  screenshot: false,
+ fullPageScreenshot: false,
  parsePDF: true
};
@@ -18,6 +18,7 @@ export type PageOptions = {
  fetchPageContent?: boolean;
  waitFor?: number;
  screenshot?: boolean;
+ fullPageScreenshot?: boolean;
  headers?: Record<string, string>;
  replaceAllPathsWithAbsolutePaths?: boolean;
  parsePDF?: boolean;
@@ -42,8 +43,8 @@ export type SearchOptions = {

export type CrawlerOptions = {
  returnOnlyUrls?: boolean;
- includes?: string[];
- excludes?: string[];
+ includes?: string | string[];
+ excludes?: string | string[];
  maxCrawledLinks?: number;
  maxDepth?: number;
  limit?: number;
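With the widened type, crawlerOptions can carry the patterns either as an array or as a single comma-separated string; the hunks further down normalize both forms to the same thing. A tiny sketch of the two equivalent caller shapes (the glob patterns are illustrative only):

// both forms should normalize to the same string[] downstream
const asArray  = { includes: ["blog/*", "docs/*"], excludes: ["admin/*"] };
const asString = { includes: "blog/*,docs/*", excludes: "admin/*" }; // comma-separated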
@@ -131,13 +131,13 @@ const saveJob = async (job: Job, result: any) => {

    if (error) throw new Error(error.message);
    try {
-     await job.moveToCompleted(null);
+     await job.moveToCompleted(null, false, false);
    } catch (error) {
      // I think the job won't exist here anymore
    }
  } else {
    try {
-     await job.moveToCompleted(result);
+     await job.moveToCompleted(result, false, false);
    } catch (error) {
      // I think the job won't exist here anymore
    }
@@ -27,8 +27,8 @@ export class WebScraperDataProvider {
  private bullJobId: string;
  private urls: string[] = [""];
  private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
- private includes: string[];
- private excludes: string[];
+ private includes: string | string[];
+ private excludes: string | string[];
  private maxCrawledLinks: number;
  private maxCrawledDepth: number = 10;
  private returnOnlyUrls: boolean;
@@ -171,8 +171,8 @@
    const crawler = new WebCrawler({
      jobId: this.jobId,
      initialUrl: this.urls[0],
-     includes: this.includes,
-     excludes: this.excludes,
+     includes: Array.isArray(this.includes) ? this.includes : this.includes.split(','),
+     excludes: Array.isArray(this.excludes) ? this.excludes : this.excludes.split(','),
      maxCrawledLinks: this.maxCrawledLinks,
      maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
      limit: this.limit,
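The same Array.isArray(...) ? ... : ....split(',') normalization recurs in the hunks below. A small helper (hypothetical, not part of this commit) would capture the pattern in one place:

// hypothetical helper capturing the normalization used throughout this commit
function toStringArray(value: string | string[] | undefined): string[] {
  if (value === undefined) return [];
  const arr = Array.isArray(value) ? value : value.split(",");
  return arr.map((s) => s.trim()).filter((s) => s !== "");
}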
@@ -445,6 +445,10 @@
    const url = new URL(document.metadata.sourceURL);
    const path = url.pathname;

+   if (!Array.isArray(this.excludes)) {
+     this.excludes = this.excludes.split(',');
+   }
+
    if (this.excludes.length > 0 && this.excludes[0] !== "") {
      // Check if the link should be excluded
      if (
@@ -456,6 +460,10 @@
      }
    }

+   if (!Array.isArray(this.includes)) {
+     this.includes = this.includes.split(',');
+   }
+
    if (this.includes.length > 0 && this.includes[0] !== "") {
      // Check if the link matches the include patterns, if any are specified
      if (this.includes.length > 0) {
@@ -567,8 +575,15 @@
      options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
      options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
      false;
    //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
    this.excludes = this.excludes.filter((item) => item !== "");

+   if (typeof options.crawlerOptions?.excludes === 'string') {
+     this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
+   }

+   if (typeof options.crawlerOptions?.includes === 'string') {
+     this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
+   }

    this.crawlerMode = options.crawlerOptions?.mode ?? "default";
    this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
    this.allowBackwardCrawling =
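One subtle point these guards and the trailing .filter calls address: an empty string is not a harmless input here, because splitting it still produces a one-element array. A quick illustration:

"".split(',');               // [""] -> one empty pattern, not zero patterns
"blog/*,,docs/*".split(','); // ["blog/*", "", "docs/*"] -> hence the .filter((item) => item.trim() !== "")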
@@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger";
 * @param url The URL to scrape
 * @param waitFor The time to wait for the page to load
 * @param screenshot Whether to take a screenshot
+ * @param fullPageScreenshot Whether to take a full page screenshot
 * @param pageOptions The options for the page
 * @param headers The headers to send with the request
 * @param options The options for the request
@@ -20,6 +21,7 @@ export async function scrapWithFireEngine({
  url,
  waitFor = 0,
  screenshot = false,
+ fullPageScreenshot = false,
  pageOptions = { parsePDF: true },
  fireEngineOptions = {},
  headers,
@@ -28,6 +30,7 @@
  url: string;
  waitFor?: number;
  screenshot?: boolean;
+ fullPageScreenshot?: boolean;
  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
  fireEngineOptions?: FireEngineOptions;
  headers?: Record<string, string>;
@@ -49,6 +52,7 @@
  const waitParam = reqParams["params"]?.wait ?? waitFor;
  const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
  const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
+ const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
  const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@@ -61,7 +65,7 @@
  let engine = engineParam; // do we want fireEngineOptions as first choice?

  Logger.info(
-   `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
+   `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
  );
@@ -71,6 +75,7 @@
      url: url,
      wait: waitParam,
      screenshot: screenshotParam,
+     fullPageScreenshot: fullPageScreenshotParam,
      headers: headers,
      pageOptions: pageOptions,
      ...fireEngineOptionsParam,
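Downstream of these parameter changes, the request body sent to the Fire-Engine service simply gains the new flag. A sketch of the resulting payload (field names as in the hunk above, values purely illustrative):

// illustrative payload assembled by scrapWithFireEngine after this change
const body = {
  url: "https://example.com",
  wait: 1000,               // waitParam
  screenshot: false,        // screenshotParam
  fullPageScreenshot: true, // fullPageScreenshotParam (new)
  headers: undefined,
  pageOptions: { parsePDF: true },
  // ...fireEngineOptionsParam (e.g. { engine: "playwright" })
};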
@@ -128,6 +128,7 @@ export async function scrapSingleUrl(
    includeRawHtml: false,
    waitFor: 0,
    screenshot: false,
+   fullPageScreenshot: false,
    headers: undefined,
  },
  extractorOptions: ExtractorOptions = {
@@ -171,6 +172,7 @@
      url,
      waitFor: pageOptions.waitFor,
      screenshot: pageOptions.screenshot,
+     fullPageScreenshot: pageOptions.fullPageScreenshot,
      pageOptions: pageOptions,
      headers: pageOptions.headers,
      fireEngineOptions: {
@@ -306,7 +308,7 @@
  const scrapersInOrder = getScrapingFallbackOrder(
    defaultScraper,
    pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
-   pageOptions && pageOptions.screenshot && pageOptions.screenshot === true,
+   pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
    pageOptions && pageOptions.headers && pageOptions.headers !== undefined
  );
@@ -240,4 +240,12 @@ export const urlSpecificParams = {
      },
    },
  },
+ "digikey.com":{
+   defaultScraper: "fire-engine",
+   params:{
+     fireEngineOptions:{
+       engine: "tlsclient",
+     },
+   },
+ }
};
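The new entry follows the same shape as the existing per-site overrides in urlSpecificParams. A hypothetical additional entry (domain and engine chosen purely for illustration, not part of this commit) would look like:

"example-heavy-site.com": {
  defaultScraper: "fire-engine",
  params: {
    fireEngineOptions: {
      engine: "playwright", // e.g. a JS-rendering engine instead of tlsclient
    },
  },
},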
@@ -44,9 +44,9 @@ export async function logScrape(
    ]);

    if (error) {
-     Logger.error(`Error logging proxy:\n${error}`);
+     Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
    }
  } catch (error) {
-   Logger.error(`Error logging proxy:\n${error}`);
+   Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`);
  }
}
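Worth noting why the JSON.stringify change helps: the error here is a plain object, so interpolating it into a template literal prints "[object Object]", while serializing it exposes its fields. A quick illustration (the error shape is hypothetical):

const error = { message: "duplicate key value", code: "23505" };
console.log(`Error logging proxy:\n${error}`);                 // prints: [object Object]
console.log(`Error logging proxy:\n${JSON.stringify(error)}`); // prints: {"message":"duplicate key value","code":"23505"}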
@@ -14,7 +14,7 @@ export function getWebScraperQueue() {
      maxStalledCount: 10,
    },
    defaultJobOptions:{
-     attempts: 5
+     attempts: 2
    }
  });
  Logger.info("Web scraper queue created");
@@ -20,7 +20,7 @@ if (process.env.ENV === 'production') {
const wsq = getWebScraperQueue();

async function processJob(job: Job, done) {
- Logger.debug(`🐂 Worker taking job ${job.id}`);
+ Logger.info(`🐂 Worker taking job ${job.id}`);

  try {
    job.progress({
@@ -61,7 +61,7 @@ async function processJob(job: Job, done) {
      pageOptions: job.data.pageOptions,
      origin: job.data.origin,
    });
-   Logger.debug(`🐂 Job done ${job.id}`);
+   Logger.info(`🐂 Job done ${job.id}`);
    done(null, data);
  } catch (error) {
    Logger.error(`🐂 Job errored ${job.id} - ${error}`);
@@ -36,17 +36,9 @@ export const supabase_service: SupabaseClient = new Proxy(
  new SupabaseService(),
  {
    get: function (target, prop, receiver) {
      if (process.env.USE_DB_AUTHENTICATION === "false") {
        Logger.debug(
          "Attempted to access Supabase client when it's not configured."
        );
      }
      const client = target.getClient();
      // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
      if (client === null) {
        Logger.error(
          "Attempted to access Supabase client when it's not configured."
        );
        return () => {
          throw new Error("Supabase client is not configured.");
        };
apps/js-sdk/firecrawl/.gitignore (vendored, 2 lines changed)
@@ -128,3 +128,5 @@ dist
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
+
+ build
apps/js-sdk/firecrawl/build/cjs/index.js (new file, 271 lines added)
@@ -0,0 +1,271 @@
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const axios_1 = __importDefault(require("axios"));
const zod_1 = require("zod");
const zod_to_json_schema_1 = require("zod-to-json-schema");
/**
* Main class for interacting with the Firecrawl API.
*/
class FirecrawlApp {
/**
* Initializes a new instance of the FirecrawlApp class.
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null, apiUrl = null }) {
this.apiKey = apiKey || "";
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
if (!this.apiKey) {
throw new Error("No API key provided");
}
}
/**
* Scrapes a URL using the Firecrawl API.
* @param {string} url - The URL to scrape.
* @param {Params | null} params - Additional parameters for the scrape request.
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
*/
scrapeUrl(url, params = null) {
var _a;
return __awaiter(this, void 0, void 0, function* () {
const headers = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
};
let jsonData = Object.assign({ url }, params);
if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) {
let schema = params.extractorOptions.extractionSchema;
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
if (schema instanceof zod_1.z.ZodSchema) {
schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
}
jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) });
}
try {
const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return responseData;
}
else {
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
}
}
else {
this.handleError(response, "scrape URL");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
});
}
/**
* Searches for a query using the Firecrawl API.
* @param {string} query - The query to search for.
* @param {Params | null} params - Additional parameters for the search request.
* @returns {Promise<SearchResponse>} The response from the search operation.
*/
search(query, params = null) {
return __awaiter(this, void 0, void 0, function* () {
const headers = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
};
let jsonData = { query };
if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params);
}
try {
const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return responseData;
}
else {
throw new Error(`Failed to search. Error: ${responseData.error}`);
}
}
else {
this.handleError(response, "search");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
});
}
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param {string} url - The URL to crawl.
* @param {Params | null} params - Additional parameters for the crawl request.
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
* @param {number} pollInterval - Time in seconds for job status checks.
* @param {string} idempotencyKey - Optional idempotency key for the request.
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
*/
crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) {
return __awaiter(this, void 0, void 0, function* () {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData = { url };
if (params) {
jsonData = Object.assign(Object.assign({}, jsonData), params);
}
try {
const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers);
if (response.status === 200) {
const jobId = response.data.jobId;
if (waitUntilDone) {
return this.monitorJobStatus(jobId, headers, pollInterval);
}
else {
return { success: true, jobId };
}
}
else {
this.handleError(response, "start crawl job");
}
}
catch (error) {
console.log(error);
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
});
}
/**
* Checks the status of a crawl job using the Firecrawl API.
* @param {string} jobId - The job ID of the crawl operation.
* @returns {Promise<JobStatusResponse>} The response containing the job status.
*/
checkCrawlStatus(jobId) {
return __awaiter(this, void 0, void 0, function* () {
const headers = this.prepareHeaders();
try {
const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
if (response.status === 200) {
return {
success: true,
status: response.data.status,
current: response.data.current,
current_url: response.data.current_url,
current_step: response.data.current_step,
total: response.data.total,
data: response.data.data,
partial_data: !response.data.data
? response.data.partial_data
: undefined,
};
}
else {
this.handleError(response, "check crawl status");
}
}
catch (error) {
throw new Error(error.message);
}
return {
success: false,
status: "unknown",
current: 0,
current_url: "",
current_step: "",
total: 0,
error: "Internal server error.",
};
});
}
/**
* Prepares the headers for an API request.
* @returns {AxiosRequestHeaders} The prepared headers.
*/
prepareHeaders(idempotencyKey) {
return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}));
}
/**
* Sends a POST request to the specified URL.
* @param {string} url - The URL to send the request to.
* @param {Params} data - The data to send in the request.
* @param {AxiosRequestHeaders} headers - The headers for the request.
* @returns {Promise<AxiosResponse>} The response from the POST request.
*/
postRequest(url, data, headers) {
return axios_1.default.post(url, data, { headers });
}
/**
* Sends a GET request to the specified URL.
* @param {string} url - The URL to send the request to.
* @param {AxiosRequestHeaders} headers - The headers for the request.
* @returns {Promise<AxiosResponse>} The response from the GET request.
*/
getRequest(url, headers) {
return axios_1.default.get(url, { headers });
}
/**
* Monitors the status of a crawl job until completion or failure.
* @param {string} jobId - The job ID of the crawl operation.
* @param {AxiosRequestHeaders} headers - The headers for the request.
* @param {number} timeout - Timeout in seconds for job status checks.
* @returns {Promise<any>} The final job status or data.
*/
monitorJobStatus(jobId, headers, checkInterval) {
return __awaiter(this, void 0, void 0, function* () {
while (true) {
const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
return statusData.data;
}
else {
throw new Error("Crawl job completed but no data was returned");
}
}
else if (["active", "paused", "pending", "queued"].includes(statusData.status)) {
if (checkInterval < 2) {
checkInterval = 2;
}
yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again
}
else {
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
}
}
else {
this.handleError(statusResponse, "check crawl status");
}
}
});
}
/**
* Handles errors from API responses.
* @param {AxiosResponse} response - The response from the API.
* @param {string} action - The action being performed when the error occurred.
*/
handleError(response, action) {
if ([402, 408, 409, 500].includes(response.status)) {
const errorMessage = response.data.error || "Unknown error occurred";
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
}
else {
throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
}
}
}
exports.default = FirecrawlApp;
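The compiled output above is the whole public surface of the JS SDK. A short usage sketch against it (API key and URLs are placeholders; the page options follow the fields added elsewhere in this commit):

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// single-page scrape with the new page options
const scraped = await app.scrapeUrl("https://example.com", {
  pageOptions: { fullPageScreenshot: true, waitFor: 1000 },
});

// crawl and poll every 5 seconds until the job completes
const documents = await app.crawlUrl(
  "https://example.com",
  { crawlerOptions: { includes: ["blog/*"], limit: 10 } },
  true, // waitUntilDone
  5     // pollInterval in seconds
);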
apps/js-sdk/firecrawl/build/cjs/package.json (new file, 1 line added)
@@ -0,0 +1 @@
{"type": "commonjs"}
apps/js-sdk/firecrawl/build/esm/package.json (new file, 1 line added)
@@ -0,0 +1 @@
{"type": "module"}
apps/js-sdk/firecrawl/package-lock.json (generated, 4 lines changed)
@@ -1,12 +1,12 @@
{
  "name": "@mendable/firecrawl-js",
- "version": "0.0.29",
+ "version": "0.0.34",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "@mendable/firecrawl-js",
-     "version": "0.0.29",
+     "version": "0.0.34",
      "license": "MIT",
      "dependencies": {
        "axios": "^1.6.8",
@@ -1,12 +1,16 @@
{
  "name": "@mendable/firecrawl-js",
- "version": "0.0.29",
+ "version": "0.0.35",
  "description": "JavaScript SDK for Firecrawl API",
- "main": "build/index.js",
+ "main": "build/cjs/index.js",
  "types": "types/index.d.ts",
  "type": "module",
+ "exports": {
+   "require": "./build/cjs/index.js",
+   "import": "./build/esm/index.js"
+ },
  "scripts": {
-   "build": "tsc",
+   "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json",
    "build-and-publish": "npm run build && npm publish --access public",
    "publish-beta": "npm run build && npm publish --access public --tag beta",
    "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts"
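The build script now emits two trees, and the nested package.json files added above ({"type": "commonjs"} under build/cjs, {"type": "module"} under build/esm) tell Node how to interpret the .js files in each tree, so the exports map can serve both module systems. A sketch of consumption from either side (each line would live in its own file):

// in an ESM file: resolves ./build/esm/index.js via the "import" condition
import FirecrawlApp from "@mendable/firecrawl-js";

// in a CommonJS file: resolves ./build/cjs/index.js via the "require" condition
const FirecrawlAppCjs = require("@mendable/firecrawl-js").default;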
apps/js-sdk/firecrawl/types/index.d.ts (vendored, 22 lines changed)
@@ -73,16 +73,16 @@ export interface ScrapeResponse {
  error?: string;
}
/**
- * Response interface for searching operations.
- */
+ * Response interface for searching operations.
+ */
export interface SearchResponse {
  success: boolean;
  data?: FirecrawlDocument[];
  error?: string;
}
/**
- * Response interface for crawling operations.
- */
+ * Response interface for crawling operations.
+ */
export interface CrawlResponse {
  success: boolean;
  jobId?: string;
@@ -90,24 +90,28 @@ export interface CrawlResponse {
  error?: string;
}
/**
- * Response interface for job status checks.
- */
+ * Response interface for job status checks.
+ */
export interface JobStatusResponse {
  success: boolean;
  status: string;
  current?: number;
  current_url?: string;
  current_step?: string;
  total?: number;
  jobId?: string;
  data?: FirecrawlDocument[];
  partial_data?: FirecrawlDocument[];
  error?: string;
}
/**
- * Generic parameter interface.
- */
+ * Generic parameter interface.
+ */
export interface Params {
  [key: string]: any;
  extractorOptions?: {
    extractionSchema: z.ZodSchema | any;
-   mode?: "llm-extraction" | "llm-extraction-from-raw-html";
+   mode?: "llm-extraction";
    extractionPrompt?: string;
  };
}
apps/js-sdk/package-lock.json (generated, 34 lines changed)
@@ -13,6 +13,7 @@
  "axios": "^1.6.8",
  "ts-node": "^10.9.2",
  "typescript": "^5.4.5",
+ "uuid": "^10.0.0",
  "zod": "^3.23.8"
},
"devDependencies": {
@@ -450,6 +451,15 @@
  "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz",
  "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA=="
},
+ "node_modules/@types/node": {
+   "version": "20.14.11",
+   "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz",
+   "integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==",
+   "peer": true,
+   "dependencies": {
+     "undici-types": "~5.26.4"
+   }
+ },
"node_modules/acorn": {
  "version": "8.11.3",
  "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz",
@@ -728,6 +738,24 @@
    "node": ">=14.17"
  }
},
+ "node_modules/undici-types": {
+   "version": "5.26.5",
+   "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
+   "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
+   "peer": true
+ },
+ "node_modules/uuid": {
+   "version": "10.0.0",
+   "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz",
+   "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==",
+   "funding": [
+     "https://github.com/sponsors/broofa",
+     "https://github.com/sponsors/ctavan"
+   ],
+   "bin": {
+     "uuid": "dist/bin/uuid"
+   }
+ },
"node_modules/v8-compile-cache-lib": {
  "version": "3.0.1",
  "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
@@ -750,9 +778,9 @@
  }
},
"node_modules/zod-to-json-schema": {
- "version": "3.23.0",
- "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz",
- "integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==",
+ "version": "3.23.1",
+ "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz",
+ "integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==",
  "peerDependencies": {
    "zod": "^3.23.3"
  }
@@ -15,6 +15,7 @@
  "axios": "^1.6.8",
  "ts-node": "^10.9.2",
  "typescript": "^5.4.5",
+ "uuid": "^10.0.0",
  "zod": "^3.23.8"
},
"devDependencies": {
@@ -29,6 +29,7 @@ x-common-service: &common-service
      - SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
      - HOST=${HOST:-0.0.0.0}
+     - SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
      - LOGGING_LEVEL=${LOGGING_LEVEL}
    extra_hosts:
      - "host.docker.internal:host-gateway"
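The compose change simply forwards an existing host variable into the containers; setting it is a one-liner in the .env file read by docker-compose (the URL below is a placeholder):

# .env (forwarded to the containers by the compose change above)
SELF_HOSTED_WEBHOOK_URL=https://example.com/firecrawl-webhook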