Reviewer note: this patch was re-lined from a whitespace-collapsed copy. Context-line
indentation, blank lines and the generic arguments on Set<...> / Promise<...> were
reconstructed; TODO confirm against the target files (or apply with --ignore-whitespace).

diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index d15bd677..738c8380 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -18,6 +18,7 @@ export type ScrapeUrlResponse = ({
     error: any,
 }) & {
     logs: any[],
+    engines: EngineResultsTracker,
 }
 
 export type Meta = {
@@ -116,7 +117,7 @@ export type InternalOptions = {
     v0DisableJsDom?: boolean;
 };
 
-export type EngineResultsTracker = { [E in Engine]?: {
+export type EngineResultsTracker = { [E in Engine]?: ({
     state: "error",
     error: any,
     unexpected: boolean,
@@ -127,6 +128,9 @@ export type EngineResultsTracker = { [E in Engine]?: {
     unsupportedFeatures: Set<FeatureFlag>,
 } | {
     state: "timeout",
+}) & {
+    startedAt: number,
+    finishedAt: number,
 } };
 
 export type EngineScrapeResultWithContext = {
@@ -135,6 +139,38 @@ export type EngineScrapeResultWithContext = {
     result: (EngineScrapeResult & { markdown: string }),
 };
 
+/**
+ * Returns a copy of `error` without its own `results` property, so the error
+ * can be stored inside an EngineResultsTracker that it also points to without
+ * forming a circular structure. Values that are not objects, or that carry no
+ * truthy `results`, are returned unchanged.
+ *
+ * The copy is shallow and keeps the prototype plus every other own property.
+ * structuredClone is deliberately not used: it reduces Error subclasses to a
+ * plain Error without their custom fields and throws on uncloneable members.
+ */
+function safeguardCircularError<T>(error: T): T {
+    if (typeof error !== "object" || error === null || !Reflect.get(error, "results")) {
+        return error;
+    }
+
+    const copy = Object.create(Object.getPrototypeOf(error));
+    for (const key of Reflect.ownKeys(error)) {
+        if (key === "results") {
+            continue;
+        }
+        // Read the value instead of copying the descriptor so accessor-backed
+        // properties are snapshotted as plain data on the copy.
+        Object.defineProperty(copy, key, {
+            value: Reflect.get(error, key),
+            writable: true,
+            configurable: true,
+            enumerable: Object.prototype.propertyIsEnumerable.call(error, key),
+        });
+    }
+    return copy;
+}
+
 async function scrapeURLLoop(
     meta: Meta
 ): Promise<ScrapeUrlResponse> {
@@ -149,6 +185,7 @@ async function scrapeURLLoop(
     let result: EngineScrapeResultWithContext | null = null;
 
     for (const { engine, unsupportedFeatures } of fallbackList) {
+        const startedAt = Date.now();
         try {
             meta.logger.info("Scraping via " + engine + "...");
             const _engineResult = await scrapeURLWithEngine(meta, engine);
@@ -167,6 +204,8 @@ async function scrapeURLLoop(
                 result: engineResult,
                 factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
                 unsupportedFeatures,
+                startedAt,
+                finishedAt: Date.now(),
             };
 
             // NOTE: TODO: what to do when status code is bad is tough...
@@ -186,23 +225,43 @@ async function scrapeURLLoop(
                 meta.logger.info("Engine " + engine + " could not scrape the page.", { error });
                 results[engine] = {
                     state: "error",
-                    error,
+                    error: safeguardCircularError(error),
                     unexpected: false,
+                    startedAt,
+                    finishedAt: Date.now(),
                 };
             } else if (error instanceof TimeoutError) {
                 meta.logger.info("Engine " + engine + " timed out while scraping.", { error });
                 results[engine] = {
                     state: "timeout",
+                    startedAt,
+                    finishedAt: Date.now(),
                 };
             } else if (error instanceof AddFeatureError) {
                 throw error;
+            } else if (error instanceof LLMRefusalError) {
+                // Attach the tracker first: safeguardCircularError only strips a
+                // `results` property that is already present, and storing the
+                // error itself would make results -> error -> results circular.
+                error.results = results;
+                results[engine] = {
+                    state: "error",
+                    error: safeguardCircularError(error),
+                    unexpected: true,
+                    startedAt,
+                    finishedAt: Date.now(),
+                }
+                meta.logger.warn("LLM refusal encountered", { error });
+                throw error;
             } else {
                 Sentry.captureException(error);
                 meta.logger.info("An unexpected error happened while scraping with " + engine + ".", { error });
                 results[engine] = {
                     state: "error",
-                    error,
+                    error: safeguardCircularError(error),
                     unexpected: true,
+                    startedAt,
+                    finishedAt: Date.now(),
                 }
             }
         }
@@ -237,6 +296,7 @@ async function scrapeURLLoop(
         success: true,
         document,
         logs: meta.logs,
+        engines: results,
     };
 }
 
@@ -261,19 +321,25 @@ export async function scrapeURL(
                 }
             }
         }
     } catch (error) {
+        let results: EngineResultsTracker = {};
+
         if (error instanceof NoEnginesLeftError) {
             meta.logger.warn("scrapeURL: All scraping engines failed!", { error });
+            results = error.results;
         } else if (error instanceof LLMRefusalError) {
             meta.logger.warn("scrapeURL: LLM refused to extract content", { error });
+            results = error.results ?? {};
         } else {
             Sentry.captureException(error);
             meta.logger.error("scrapeURL: Unexpected error happened", { error });
+            // TODO: results?
         }
         return {
             success: false,
             error,
             logs: meta.logs,
+            engines: results,
         }
     }
 }
diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
index 335ef2dd..e7ffc4b2 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts
@@ -3,14 +3,23 @@ import { encoding_for_model } from "@dqbd/tiktoken";
 import { TiktokenModel } from "@dqbd/tiktoken";
 import { Document, ExtractOptions } from "../../../controllers/v1/types";
 import { Logger } from "winston";
-import { Meta } from "..";
+import { EngineResultsTracker, Meta } from "..";
 
 const maxTokens = 32000;
 const modifier = 4;
 
+/**
+ * Thrown when the LLM refuses to extract the website's content.
+ */
 export class LLMRefusalError extends Error {
+    /** Refusal text passed to the constructor. */
+    public refusal: string;
+    /** Engine results attached by the scrape loop before rethrowing; unset until then. */
+    public results: EngineResultsTracker | undefined;
+
     constructor(refusal: string) {
-        super("LLM refused to extract the website's content", { cause: { refusal } })
+        super("LLM refused to extract the website's content")
+        this.refusal = refusal;
     }
 }
 