mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
Merge pull request #321 from mendableai/bug/fix-issue-310
[Bug] Added default values and fixed pdf bug
This commit is contained in:
commit
ac08e20c33
|
@ -9,6 +9,7 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
|||
import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
||||
|
@ -56,15 +57,8 @@ export async function crawlController(req: Request, res: Response) {
|
|||
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {
|
||||
allowBackwardCrawling: false
|
||||
};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
removeTags: [],
|
||||
parsePDF: true
|
||||
};
|
||||
const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
|
||||
if (mode === "single_urls" && !url.includes(",")) {
|
||||
try {
|
||||
|
@ -100,7 +94,7 @@ export async function crawlController(req: Request, res: Response) {
|
|||
crawlerOptions: crawlerOptions,
|
||||
team_id: team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: req.body.origin ?? "api",
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
});
|
||||
|
||||
await logCrawl(job.id.toString(), team_id);
|
||||
|
|
|
@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
|
|||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||
|
||||
export async function scrapeHelper(
|
||||
req: Request,
|
||||
|
@ -105,21 +106,13 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
return res.status(status).json({ error });
|
||||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
parsePDF: true
|
||||
};
|
||||
const extractorOptions = req.body.extractorOptions ?? {
|
||||
mode: "markdown"
|
||||
}
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
|
||||
if (extractorOptions.mode === "llm-extraction") {
|
||||
pageOptions.onlyMainContent = true;
|
||||
}
|
||||
const origin = req.body.origin ?? "api";
|
||||
const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
const timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
|
|
26
apps/api/src/lib/default-values.ts
Normal file
26
apps/api/src/lib/default-values.ts
Normal file
|
@ -0,0 +1,26 @@
|
|||
/** Origin value the crawl and scrape controllers fall back to when `req.body.origin` is null or undefined. */
export const defaultOrigin = "api";
|
||||
|
||||
/** Timeout (in milliseconds) the scrape controller falls back to when `req.body.timeout` is null or undefined. */
export const defaultTimeout = 30000; // 30 seconds
|
||||
|
||||
/**
 * Baseline page options for the scrape controller.
 *
 * The controller spreads `req.body.pageOptions` over a copy of this object,
 * so any key the caller omits keeps the value listed here. The controller
 * then forces `onlyMainContent` to `true` on the merged copy when the
 * extractor mode is "llm-extraction".
 *
 * NOTE(review): the unit of `waitFor` is not visible in this file —
 * presumably milliseconds; confirm against the scraper.
 */
export const defaultPageOptions = {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
parsePDF: true
|
||||
};
|
||||
|
||||
/**
 * Baseline crawler options for the crawl controller.
 *
 * The controller spreads `req.body.crawlerOptions` over a copy of this
 * object, so a request that sets only some crawler options still gets
 * `allowBackwardCrawling: false` unless it overrides that key explicitly.
 */
export const defaultCrawlerOptions = {
|
||||
allowBackwardCrawling: false
|
||||
}
|
||||
|
||||
/**
 * Baseline page options for the crawl controller.
 *
 * The controller spreads `req.body.pageOptions` over a copy of this object,
 * so any key the caller omits keeps the value listed here.
 *
 * NOTE(review): object spread is shallow, so every merged options object
 * that does not override `removeTags` shares this one array instance.
 * Treat it as read-only; pushing into it would leak into later requests.
 */
export const defaultCrawlPageOptions = {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
removeTags: [],
|
||||
parsePDF: true
|
||||
}
|
||||
|
||||
/**
 * Baseline extractor options for the scrape controller.
 *
 * The controller spreads `req.body.extractorOptions` over a copy of this
 * object, so `mode` is "markdown" unless the request overrides it. The
 * controller compares the merged `mode` against "llm-extraction" to decide
 * whether to force `onlyMainContent`.
 */
export const defaultExtractorOptions = {
|
||||
mode: "markdown"
|
||||
}
|
Loading…
Reference in New Issue
Block a user