Nick: improvements to LLM extract error handling

2024-11-16 11:42:24 +08:00 · 2024-08-30 11:57:55 -03:00 · 2024-08-30 11:57:55 -03:00 · e5ca4364ba
commit e5ca4364ba
parent 41eb620959
6 changed files with 101 additions and 78 deletions
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@ -1,15 +1,27 @@
 import { Request, Response } from "express";
-import { Logger } from '../../lib/logger';
-import { Document, legacyDocumentConverter, legacyExtractorOptions, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { Logger } from "../../lib/logger";
+import {
+  Document,
+  legacyDocumentConverter,
+  legacyExtractorOptions,
+  legacyScrapeOptions,
+  RequestWithAuth,
+  ScrapeRequest,
+  scrapeRequestSchema,
+  ScrapeResponse,
+} from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
-import { v4 as uuidv4 } from 'uuid';
+import { v4 as uuidv4 } from "uuid";
 import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
 import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
 import { logJob } from "../../services/logging/log_job";
 import { getJobPriority } from "../../lib/job-priority";
 import { PlanType } from "../../types";

-export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
+export async function scrapeController(
+  req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>,
+  res: Response<ScrapeResponse>
+) {
  req.body = scrapeRequestSchema.parse(req.body);
  let earlyReturn = false;

@ -20,18 +32,27 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
  const jobId = uuidv4();

  const startTime = new Date().getTime();
-  const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10})
-
-  const job = await addScrapeJob({
-    url: req.body.url,
-    mode: "single_urls",
-    crawlerOptions: {},
+  const jobPriority = await getJobPriority({
+    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
-    pageOptions,
-    extractorOptions,
-    origin: req.body.origin,
-    is_scrape: true,
-  }, {}, jobId, jobPriority);
+    basePriority: 10,
+  });
+
+  const job = await addScrapeJob(
+    {
+      url: req.body.url,
+      mode: "single_urls",
+      crawlerOptions: {},
+      team_id: req.auth.team_id,
+      pageOptions,
+      extractorOptions,
+      origin: req.body.origin,
+      is_scrape: true,
+    },
+    {},
+    jobId,
+    jobPriority
+  );

  let doc: any | undefined;
  try {
@ -46,7 +67,11 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    } else {
      return res.status(500).json({
        success: false,
-        error: "Internal server error",
+        error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
+          extractorOptions && extractorOptions.mode !== "markdown"
+            ? " - Could be due to LLM parsing issues"
+            : ""
+        }`,
      });
    }
  }
@ -58,7 +83,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    return res.status(200).json({
      success: true,
      warning: "No page found",
-      data: doc
+      data: doc,
    });
  }

@ -67,7 +92,10 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;
-  const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
+  const numTokens =
+    doc && doc.markdown
+      ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
+      : 0;

  let creditsToBeBilled = 1; // Assuming 1 credit per document
  if (earlyReturn) {
@ -75,14 +103,12 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    return;
  }

-  const billingResult = await billTeam(
-    req.auth.team_id,
-    creditsToBeBilled
-  );
+  const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
  if (!billingResult.success) {
    return res.status(402).json({
      success: false,
-      error: "Failed to bill team. Insufficient credits or subscription not found.",
+      error:
+        "Failed to bill team. Insufficient credits or subscription not found.",
    });
  }

@ -98,7 +124,7 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: pageOptions,
-    origin: origin, 
+    origin: origin,
    extractor_options: { mode: "markdown" },
    num_tokens: numTokens,
  });
@ -107,4 +133,4 @@ export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse,
    success: true,
    data: legacyDocumentConverter(doc),
  });
-}
+}
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@ -25,34 +25,35 @@ export async function generateCompletions(
      switch (switchVariable) {
        case "openAI":
          const llm = new OpenAI();
-          try{
-
-          const completionResult = await generateOpenAICompletions({
-            client: llm,
-            document: document,
-            schema: schema,
-            prompt: prompt,
-            systemPrompt: systemPrompt,
-            mode: mode,
-          });
-          // Validate the JSON output against the schema using AJV
-          if(schema){
-          const validate = ajv.compile(schema);
-          if (!validate(completionResult.llm_extraction)) {
-            //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
-            throw new Error(
-              `JSON parsing error(s): ${validate.errors
-                ?.map((err) => err.message)
-                .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
-              );
+          try {
+            const completionResult = await generateOpenAICompletions({
+              client: llm,
+              document: document,
+              schema: schema,
+              prompt: prompt,
+              systemPrompt: systemPrompt,
+              mode: mode,
+            });
+            // Validate the JSON output against the schema using AJV
+            if (schema) {
+              const validate = ajv.compile(schema);
+              if (!validate(completionResult.llm_extraction)) {
+                //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc.
+                throw new Error(
+                  `JSON parsing error(s): ${validate.errors
+                    ?.map((err) => err.message)
+                    .join(
+                      ", "
+                    )}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.`
+                );
+              }
            }
-          }

-          return completionResult;
-        } catch (error) {
-          Logger.error(`Error generating completions: ${error}`);
-          throw error;
-        }
+            return completionResult;
+          } catch (error) {
+            Logger.error(`Error generating completions: ${error}`);
+            throw error;
+          }
        default:
          throw new Error("Invalid client");
      }
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@ -76,27 +76,6 @@ export async function generateOpenAICompletions({
  let completion;
  let llmExtraction;
  if (prompt && !schema) {
-    // If prompt is defined, ask OpenAI to generate a schema based on the prompt
-    //   const schemaCompletion = await openai.chat.completions.create({
-    //     model,
-    //     messages: [
-    //       {
-    //         role: "system",
-    //         content: "You are a helpful assistant that generates JSON schemas based on user prompts.",
-    //       },
-    //       {
-    //         role: "user",
-    //         content: `Generate a JSON schema compatible with openai function calling based on this prompt: ${prompt}`,
-    //       },
-    //     ],
-    //     temperature: 0,
-    //     response_format: { type: "json_object" },
-    // });
-
-    // console.log(schemaCompletion.choices[0].message.content);
-
-    // const generatedSchema = JSON.parse(schemaCompletion.choices[0].message.content);
-    console.log(prompt);
    const jsonCompletion = await openai.chat.completions.create({
      model,
      messages: [
@ -105,16 +84,22 @@ export async function generateOpenAICompletions({
          content: systemPrompt,
        },
        { role: "user", content },
-        { role: "user", content: `Transform the above content into structured json output based on the following user request: ${prompt}` },
+        {
+          role: "user",
+          content: `Transform the above content into structured json output based on the following user request: ${prompt}`,
+        },
      ],
      response_format: { type: "json_object" },
      temperature,
    });

-    console.log(jsonCompletion.choices[0].message.content);
-
-    llmExtraction = JSON.parse(jsonCompletion.choices[0].message.content.trim());
-    console.log(llmExtraction);
+    try {
+      llmExtraction = JSON.parse(
+        jsonCompletion.choices[0].message.content.trim()
+      );
+    } catch (e) {
+      throw new Error("Invalid JSON");
+    }
  } else {
    completion = await openai.chat.completions.create({
      model,
@ -141,7 +126,11 @@ export async function generateOpenAICompletions({
    const c = completion.choices[0].message.tool_calls[0].function.arguments;

    // Extract the LLM extraction content from the completion response
-    llmExtraction = JSON.parse(c);
+    try {
+      llmExtraction = JSON.parse(c);
+    } catch (e) {
+      throw new Error("Invalid JSON");
+    }
  }

  // Return the document with the LLM extraction content added
--- a/apps/api/src/lib/custom-error.ts
+++ b/apps/api/src/lib/custom-error.ts
@ -19,3 +19,4 @@ export class CustomError extends Error {
    Object.setPrototypeOf(this, CustomError.prototype);
  }
 }
+
--- a/apps/api/src/services/queue-jobs.ts
+++ b/apps/api/src/services/queue-jobs.ts
@ -62,6 +62,7 @@ export function waitForJob(jobId: string, timeout: number) {
          clearInterval(int);
          resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
        } else if (state === "failed") {
+          // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason);
          clearInterval(int);
          reject((await getScrapeQueue().getJob(jobId)).failedReason);
        }
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@ -192,6 +192,11 @@ async function processJob(job: Job, token: string) {
      job,
      token,
    });
+
+    // Better if we throw here so we capture with the correct error
+    if(!success) {
+      throw new Error(message);
+    }
    const end = Date.now();
    const timeTakenInSeconds = (end - start) / 1000;