mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Nick: improvements to llm extract error handling
This commit is contained in:
parent
41eb620959
commit
e5ca4364ba
|
@ -1,15 +1,27 @@
|
||||||
import { Request, Response } from "express";
|
import { Request, Response } from "express";
|
||||||
import { Logger } from '../../lib/logger';
|
import { Logger } from "../../lib/logger";
|
||||||
import { Document, legacyDocumentConverter, legacyExtractorOptions, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
|
import {
|
||||||
|
Document,
|
||||||
|
legacyDocumentConverter,
|
||||||
|
legacyExtractorOptions,
|
||||||
|
legacyScrapeOptions,
|
||||||
|
RequestWithAuth,
|
||||||
|
ScrapeRequest,
|
||||||
|
scrapeRequestSchema,
|
||||||
|
ScrapeResponse,
|
||||||
|
} from "./types";
|
||||||
import { billTeam } from "../../services/billing/credit_billing";
|
import { billTeam } from "../../services/billing/credit_billing";
|
||||||
import { v4 as uuidv4 } from 'uuid';
|
import { v4 as uuidv4 } from "uuid";
|
||||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||||
import { logJob } from "../../services/logging/log_job";
|
import { logJob } from "../../services/logging/log_job";
|
||||||
import { getJobPriority } from "../../lib/job-priority";
|
import { getJobPriority } from "../../lib/job-priority";
|
||||||
import { PlanType } from "../../types";
|
import { PlanType } from "../../types";
|
||||||
|
|
||||||
export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
|
export async function scrapeController(
|
||||||
|
req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
|
||||||
|
res: Response<ScrapeResponse>
|
||||||
|
) {
|
||||||
req.body = scrapeRequestSchema.parse(req.body);
|
req.body = scrapeRequestSchema.parse(req.body);
|
||||||
let earlyReturn = false;
|
let earlyReturn = false;
|
||||||
|
|
||||||
|
@ -20,18 +32,27 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
|
||||||
const jobId = uuidv4();
|
const jobId = uuidv4();
|
||||||
|
|
||||||
const startTime = new Date().getTime();
|
const startTime = new Date().getTime();
|
||||||
const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10})
|
const jobPriority = await getJobPriority({
|
||||||
|
plan: req.auth.plan as PlanType,
|
||||||
const job = await addScrapeJob({
|
|
||||||
url: req.body.url,
|
|
||||||
mode: "single_urls",
|
|
||||||
crawlerOptions: {},
|
|
||||||
team_id: req.auth.team_id,
|
team_id: req.auth.team_id,
|
||||||
pageOptions,
|
basePriority: 10,
|
||||||
extractorOptions,
|
});
|
||||||
origin: req.body.origin,
|
|
||||||
is_scrape: true,
|
const job = await addScrapeJob(
|
||||||
}, {}, jobId, jobPriority);
|
{
|
||||||
|
url: req.body.url,
|
||||||
|
mode: "single_urls",
|
||||||
|
crawlerOptions: {},
|
||||||
|
team_id: req.auth.team_id,
|
||||||
|
pageOptions,
|
||||||
|
extractorOptions,
|
||||||
|
origin: req.body.origin,
|
||||||
|
is_scrape: true,
|
||||||
|
},
|
||||||
|
{},
|
||||||
|
jobId,
|
||||||
|
jobPriority
|
||||||
|
);
|
||||||
|
|
||||||
let doc: any | undefined;
|
let doc: any | undefined;
|
||||||
try {
|
try {
|
||||||
|
@ -46,7 +67,11 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
|
||||||
} else {
|
} else {
|
||||||
return res.status(500).json({
|
return res.status(500).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "Internal server error",
|
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
|
||||||
|
extractorOptions && extractorOptions.mode !== "markdown"
|
||||||
|
? " - Could be due to LLM parsing issues"
|
||||||
|
: ""
|
||||||
|
}`,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -58,7 +83,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
warning: "No page found",
|
warning: "No page found",
|
||||||
data: doc
|
data: doc,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -67,7 +92,10 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
|
||||||
|
|
||||||
const endTime = new Date().getTime();
|
const endTime = new Date().getTime();
|
||||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||||
const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
|
const numTokens =
|
||||||
|
doc && doc.markdown
|
||||||
|
? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
|
||||||
|
: 0;
|
||||||
|
|
||||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||||
if (earlyReturn) {
|
if (earlyReturn) {
|
||||||
|
@ -75,14 +103,12 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const billingResult = await billTeam(
|
const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
|
||||||
req.auth.team_id,
|
|
||||||
creditsToBeBilled
|
|
||||||
);
|
|
||||||
if (!billingResult.success) {
|
if (!billingResult.success) {
|
||||||
return res.status(402).json({
|
return res.status(402).json({
|
||||||
success: false,
|
success: false,
|
||||||
error: "Failed to bill team. Insufficient credits or subscription not found.",
|
error:
|
||||||
|
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,7 +124,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
|
||||||
url: req.body.url,
|
url: req.body.url,
|
||||||
crawlerOptions: {},
|
crawlerOptions: {},
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
origin: origin,
|
origin: origin,
|
||||||
extractor_options: { mode: "markdown" },
|
extractor_options: { mode: "markdown" },
|
||||||
num_tokens: numTokens,
|
num_tokens: numTokens,
|
||||||
});
|
});
|
||||||
|
@ -107,4 +133,4 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
|
||||||
success: true,
|
success: true,
|
||||||
data: legacyDocumentConverter(doc),
|
data: legacyDocumentConverter(doc),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,34 +25,35 @@ export async function generateCompletions(
|
||||||
switch (switchVariable) {
|
switch (switchVariable) {
|
||||||
case "openAI":
|
case "openAI":
|
||||||
const llm = new OpenAI();
|
const llm = new OpenAI();
|
||||||
try{
|
try {
|
||||||
|
const completionResult = await generateOpenAICompletions({
|
||||||
const completionResult = await generateOpenAICompletions({
|
client: llm,
|
||||||
client: llm,
|
document: document,
|
||||||
document: document,
|
schema: schema,
|
||||||
schema: schema,
|
prompt: prompt,
|
||||||
prompt: prompt,
|
systemPrompt: systemPrompt,
|
||||||
systemPrompt: systemPrompt,
|
mode: mode,
|
||||||
mode: mode,
|
});
|
||||||
});
|
// Validate the JSON output against the schema using AJV
|
||||||
// Validate the JSON output against the schema using AJV
|
if (schema) {
|
||||||
if(schema){
|
const validate = ajv.compile(schema);
|
||||||
const validate = ajv.compile(schema);
|
if (!validate(completionResult.llm_extraction)) {
|
||||||
if (!validate(completionResult.llm_extraction)) {
|
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
||||||
//TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
|
throw new Error(
|
||||||
throw new Error(
|
`JSON parsing error(s): ${validate.errors
|
||||||
`JSON parsing error(s): ${validate.errors
|
?.map((err) => err.message)
|
||||||
?.map((err) => err.message)
|
.join(
|
||||||
.join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
|
", "
|
||||||
);
|
)}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return completionResult;
|
return completionResult;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error generating completions: ${error}`);
|
Logger.error(`Error generating completions: ${error}`);
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
throw new Error("Invalid client");
|
throw new Error("Invalid client");
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,27 +76,6 @@ export async function generateOpenAICompletions({
|
||||||
let completion;
|
let completion;
|
||||||
let llmExtraction;
|
let llmExtraction;
|
||||||
if (prompt && !schema) {
|
if (prompt && !schema) {
|
||||||
// If prompt is defined, ask OpenAI to generate a schema based on the prompt
|
|
||||||
// const schemaCompletion = await openai.chat.completions.create({
|
|
||||||
// model,
|
|
||||||
// messages: [
|
|
||||||
// {
|
|
||||||
// role: "system",
|
|
||||||
// content: "You are a helpful assistant that generates JSON schemas based on user prompts.",
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// role: "user",
|
|
||||||
// content: `Generate a JSON schema compatible with openai function calling based on this prompt: ${prompt}`,
|
|
||||||
// },
|
|
||||||
// ],
|
|
||||||
// temperature: 0,
|
|
||||||
// response_format: { type: "json_object" },
|
|
||||||
// });
|
|
||||||
|
|
||||||
// console.log(schemaCompletion.choices[0].message.content);
|
|
||||||
|
|
||||||
// const generatedSchema = JSON.parse(schemaCompletion.choices[0].message.content);
|
|
||||||
console.log(prompt);
|
|
||||||
const jsonCompletion = await openai.chat.completions.create({
|
const jsonCompletion = await openai.chat.completions.create({
|
||||||
model,
|
model,
|
||||||
messages: [
|
messages: [
|
||||||
|
@ -105,16 +84,22 @@ export async function generateOpenAICompletions({
|
||||||
content: systemPrompt,
|
content: systemPrompt,
|
||||||
},
|
},
|
||||||
{ role: "user", content },
|
{ role: "user", content },
|
||||||
{ role: "user", content: `Transform the above content into structured json output based on the following user request: ${prompt}` },
|
{
|
||||||
|
role: "user",
|
||||||
|
content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
|
||||||
|
},
|
||||||
],
|
],
|
||||||
response_format: { type: "json_object" },
|
response_format: { type: "json_object" },
|
||||||
temperature,
|
temperature,
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(jsonCompletion.choices[0].message.content);
|
try {
|
||||||
|
llmExtraction = JSON.parse(
|
||||||
llmExtraction = JSON.parse(jsonCompletion.choices[0].message.content.trim());
|
jsonCompletion.choices[0].message.content.trim()
|
||||||
console.log(llmExtraction);
|
);
|
||||||
|
} catch (e) {
|
||||||
|
throw new Error("Invalid JSON");
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
completion = await openai.chat.completions.create({
|
completion = await openai.chat.completions.create({
|
||||||
model,
|
model,
|
||||||
|
@ -141,7 +126,11 @@ export async function generateOpenAICompletions({
|
||||||
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
const c = completion.choices[0].message.tool_calls[0].function.arguments;
|
||||||
|
|
||||||
// Extract the LLM extraction content from the completion response
|
// Extract the LLM extraction content from the completion response
|
||||||
llmExtraction = JSON.parse(c);
|
try {
|
||||||
|
llmExtraction = JSON.parse(c);
|
||||||
|
} catch (e) {
|
||||||
|
throw new Error("Invalid JSON");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the document with the LLM extraction content added
|
// Return the document with the LLM extraction content added
|
||||||
|
|
|
@ -19,3 +19,4 @@ export class CustomError extends Error {
|
||||||
Object.setPrototypeOf(this, CustomError.prototype);
|
Object.setPrototypeOf(this, CustomError.prototype);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -62,6 +62,7 @@ export function waitForJob(jobId: string, timeout: number) {
|
||||||
clearInterval(int);
|
clearInterval(int);
|
||||||
resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
|
resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
|
||||||
} else if (state === "failed") {
|
} else if (state === "failed") {
|
||||||
|
// console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
|
||||||
clearInterval(int);
|
clearInterval(int);
|
||||||
reject((await getScrapeQueue().getJob(jobId)).failedReason);
|
reject((await getScrapeQueue().getJob(jobId)).failedReason);
|
||||||
}
|
}
|
||||||
|
|
|
@ -192,6 +192,11 @@ async function processJob(job: Job, token: string) {
|
||||||
job,
|
job,
|
||||||
token,
|
token,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Better if we throw here so we capture with the correct error
|
||||||
|
if(!success) {
|
||||||
|
throw new Error(message);
|
||||||
|
}
|
||||||
const end = Date.now();
|
const end = Date.now();
|
||||||
const timeTakenInSeconds = (end - start) / 1000;
|
const timeTakenInSeconds = (end - start) / 1000;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user