Merge pull request #321 from mendableai/bug/fix-issue-310

[Bug] Added default values and fixed pdf bug
Nicolas · 2024-06-26 11:50:42 -03:00 · committed by GitHub
commit ac08e20c33
3 changed files with 35 additions and 22 deletions

Changed file 1 of 3: crawl controller (crawlController)

@@ -9,6 +9,7 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
+import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -56,15 +57,8 @@ export async function crawlController(req: Request, res: Response) {
   const mode = req.body.mode ?? "crawl";
-  const crawlerOptions = req.body.crawlerOptions ?? {
-    allowBackwardCrawling: false
-  };
-  const pageOptions = req.body.pageOptions ?? {
-    onlyMainContent: false,
-    includeHtml: false,
-    removeTags: [],
-    parsePDF: true
-  };
+  const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
+  const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
 
   if (mode === "single_urls" && !url.includes(",")) {
     try {
@@ -100,7 +94,7 @@ export async function crawlController(req: Request, res: Response) {
     crawlerOptions: crawlerOptions,
     team_id: team_id,
     pageOptions: pageOptions,
-    origin: req.body.origin ?? "api",
+    origin: req.body.origin ?? defaultOrigin,
   });
 
   await logCrawl(job.id.toString(), team_id);

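The hunk above is the heart of the fix. With the old `req.body.pageOptions ?? { ... }` pattern, the defaults applied only when `pageOptions` was absent altogether; a caller sending a partial object (say, only `onlyMainContent`) silently lost `parsePDF: true`, which is presumably the PDF bug named in the commit title. Spreading defaults first and the request body second applies defaults field by field. A minimal standalone sketch of the difference (the types here are illustrative, not the project's actual entities):

    type PageOptions = {
      onlyMainContent?: boolean;
      includeHtml?: boolean;
      removeTags?: string[];
      parsePDF?: boolean;
    };

    const defaultCrawlPageOptions: PageOptions = {
      onlyMainContent: false, includeHtml: false, removeTags: [], parsePDF: true
    };

    // A request body that overrides a single field:
    const bodyPageOptions: PageOptions | undefined = { onlyMainContent: true };

    // Old pattern: defaults were skipped whenever pageOptions was present at all.
    const oldOptions = bodyPageOptions ?? defaultCrawlPageOptions;
    console.log(oldOptions.parsePDF); // undefined -- PDF parsing silently falls through

    // New pattern: defaults apply field by field; caller-supplied values still win.
    const newOptions = { ...defaultCrawlPageOptions, ...bodyPageOptions };
    console.log(newOptions.parsePDF); // true
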
Changed file 2 of 3: scrape controller (scrapeHelper / scrapeController)

@@ -8,6 +8,7 @@ import { logJob } from "../services/logging/log_job";
 import { Document } from "../lib/entities";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
 import { numTokensFromString } from '../lib/LLM-extraction/helpers';
+import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
 
 export async function scrapeHelper(
   req: Request,
@@ -105,21 +106,13 @@ export async function scrapeController(req: Request, res: Response) {
     return res.status(status).json({ error });
   }
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? {
-    onlyMainContent: false,
-    includeHtml: false,
-    waitFor: 0,
-    screenshot: false,
-    parsePDF: true
-  };
-  const extractorOptions = req.body.extractorOptions ?? {
-    mode: "markdown"
-  }
+  const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
+  const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
   if (extractorOptions.mode === "llm-extraction") {
     pageOptions.onlyMainContent = true;
   }
-  const origin = req.body.origin ?? "api";
-  const timeout = req.body.timeout ?? 30000; // Default timeout of 30 seconds
+  const origin = req.body.origin ?? defaultOrigin;
+  const timeout = req.body.timeout ?? defaultTimeout;
 
   try {
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =

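One detail worth seeing in isolation: the options are merged before the `llm-extraction` check, so `pageOptions.onlyMainContent = true` overwrites both the default and anything the caller sent. A self-contained sketch of that flow (the option shapes are assumptions matching the diff, not the real types):

    const defaultPageOptions = {
      onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false, parsePDF: true
    };
    const defaultExtractorOptions = { mode: "markdown" };

    function resolveScrapeOptions(body: {
      pageOptions?: Partial<typeof defaultPageOptions>;
      extractorOptions?: Partial<typeof defaultExtractorOptions>;
    }) {
      const pageOptions = { ...defaultPageOptions, ...body.pageOptions };
      const extractorOptions = { ...defaultExtractorOptions, ...body.extractorOptions };
      if (extractorOptions.mode === "llm-extraction") {
        pageOptions.onlyMainContent = true; // forced, regardless of caller input
      }
      return { pageOptions, extractorOptions };
    }

    // Defaults kick in when nothing is sent:
    console.log(resolveScrapeOptions({}).extractorOptions.mode); // "markdown"

    // llm-extraction wins over an explicit onlyMainContent: false:
    const resolved = resolveScrapeOptions({
      extractorOptions: { mode: "llm-extraction" },
      pageOptions: { onlyMainContent: false },
    });
    console.log(resolved.pageOptions.onlyMainContent); // true
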
Changed file 3 of 3: src/lib/default-values (new file)

@@ -0,0 +1,26 @@
+export const defaultOrigin = "api";
+
+export const defaultTimeout = 30000; // 30 seconds
+
+export const defaultPageOptions = {
+  onlyMainContent: false,
+  includeHtml: false,
+  waitFor: 0,
+  screenshot: false,
+  parsePDF: true
+};
+
+export const defaultCrawlerOptions = {
+  allowBackwardCrawling: false
+}
+
+export const defaultCrawlPageOptions = {
+  onlyMainContent: false,
+  includeHtml: false,
+  removeTags: [],
+  parsePDF: true
+}
+
+export const defaultExtractorOptions = {
+  mode: "markdown"
+}
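Centralizing the defaults in one module keeps the two controllers in sync, but note that object spread merges shallowly: a nested object supplied by the caller replaces the default wholesale rather than merging into it. That is harmless for flat option objects like these; a nested field would behave differently. A short sketch (the `headers` field is invented for illustration and does not exist in this file):

    const defaults = {
      removeTags: [] as string[],
      headers: { "User-Agent": "firecrawl" }, // hypothetical nested default
    };
    const override = { headers: { Accept: "text/html" } };
    const merged = { ...defaults, ...override };

    console.log(merged.headers);    // { Accept: "text/html" } -- the default User-Agent is gone
    console.log(merged.removeTags); // [] -- flat fields keep their defaults as expected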