Caleb: got it to a testable state I believe

This commit is contained in:
Caleb Peffer 2024-04-28 15:52:09 -07:00
parent 6ee1f2d3bc
commit 06497729e2
7 changed files with 163 additions and 31 deletions

View File

@ -8,7 +8,7 @@ dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for API Routes", () => {
describe.only("E2E Tests for API Routes", () => {
beforeAll(() => {
process.env.USE_DB_AUTHENTICATION = "true";
});
@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
}, 60000); // 60 seconds
});
describe("POST /v0/scrape with LLM Extraction", () => {
it("should extract data using LLM extraction mode", async () => {
const response = await request(TEST_URL)
.post("/v0/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
pageOptions: {
onlyMainContent: true
},
extractorOptions: {
extractorMode: "llm-extract",
extractor_prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
extractorSchema: {
type: "object",
properties: {
company_mission: {
type: "string"
},
supports_sso: {
type: "boolean"
},
is_open_source: {
type: "boolean"
}
},
required: ["company_mission", "supports_sso", "is_open_source"]
}
}
});
console.log("Response:", response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("company_mission");
expect(response.body.data).toHaveProperty("supports_sso");
expect(response.body.data).toHaveProperty("is_open_source");
});
});
describe("GET /is-production", () => {
it("should return the production status", async () => {
const response = await request(TEST_URL).get("/is-production");

View File

@ -1,3 +1,4 @@
import { ExtractorOptions } from './../lib/entities';
import { Request, Response } from "express";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@ -11,7 +12,8 @@ export async function scrapeHelper(
req: Request,
team_id: string,
crawlerOptions: any,
pageOptions: any
pageOptions: any,
extractorOptions: any
): Promise<{
success: boolean;
error?: string;
@ -35,6 +37,7 @@ export async function scrapeHelper(
...crawlerOptions,
},
pageOptions: pageOptions,
extractorOptions: extractorOptions
});
const docs = await a.getDocuments(false);
@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
}
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
const extractorOptions = req.body.extractorOptions ?? {
mode: "markdown"
}
const origin = req.body.origin ?? "api";
try {
@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
req,
team_id,
crawlerOptions,
pageOptions
pageOptions,
extractorOptions
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -0,0 +1,48 @@
import Turndown from 'turndown'
import OpenAI from 'openai'
// import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import {
ScraperCompletionResult,
generateOpenAICompletions,
} from './models.js'
import { ExtractorOptions } from '../entities.js'
// Generate completion using OpenAI
export function generateCompletions(
documents: Document[],
extractionOptions: ExtractorOptions
): Promise < ScraperCompletionResult < T >> [] {
// const schema = zodToJsonSchema(options.schema)
const schema = extractionOptions.extractionSchema;
const prompt = extractionOptions.extractionPrompt;
const loader = documents.map(async (document, i) => {
switch (this.client.constructor) {
case true:
return generateOpenAICompletions<T>(
this.client as OpenAI,
schema,
options?.prompt,
options?.temperature
)
//TODO add other models
// case LlamaModel:
// return generateLlamaCompletions<T>(
// this.client,
// await page,
// schema,
// options?.prompt,
// options?.temperature
// )
default:
throw new Error('Invalid client')
}
})
return loader
}

View File

@ -1,6 +1,8 @@
import OpenAI from 'openai'
import { z } from 'zod'
import { ScraperLoadResult } from './types'
import { Document, ExtractorOptions } from "../../lib/entities";
// import {
// LlamaModel,
// LlamaJsonSchemaGrammar,
@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
// LlamaChatSession,
// GbnfJsonSchema,
// } from 'node-llama-cpp'
import { JsonSchema7Type } from 'zod-to-json-schema'
// import { JsonSchema7Type } from 'zod-to-json-schema'
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
data: z.infer<T> | null
data: any | null
url: string
}
const defaultPrompt =
'You are a satistified web scraper. Extract the contents of the webpage'
function prepareOpenAIPage(
page: ScraperLoadResult
function prepareOpenAIDoc(
document: Document
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
if (page.mode === 'image') {
return [
{
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${page.content}` },
},
]
// Check if the markdown content exists in the document
if (!document.markdown) {
throw new Error("Markdown content is missing in the document.");
}
return [{ type: 'text', text: page.content }]
return [{ type: 'text', text: document.markdown }]
}
export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
export async function generateOpenAICompletions<T>({
client,
model = 'gpt-3.5-turbo',
document,
schema, //TODO - add zod dynamic type checking
prompt = defaultPrompt,
temperature
}: {
client: OpenAI,
model: string = 'gpt-3.5-turbo',
page: ScraperLoadResult,
schema: JsonSchema7Type,
prompt: string = defaultPrompt,
model?: string,
document: Document,
schema: any, // This should be replaced with a proper Zod schema type when available
prompt?: string,
temperature?: number
): Promise<ScraperCompletionResult<T>> {
}): Promise<Document> {
const openai = client as OpenAI
const content = prepareOpenAIPage(page)
const content = prepareOpenAIDoc(document)
const completion = await openai.chat.completions.create({
model,
@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
})
const c = completion.choices[0].message.tool_calls[0].function.arguments
// Extract the LLM extraction content from the completion response
const llmExtraction = c;
// Return the document with the LLM extraction content added
return {
data: JSON.parse(c),
url: page.url,
}
...document,
llm_extraction: llmExtraction
};
}
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(

View File

@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
closeOnFinish?: boolean
}
export type ScraperLoadResult = {
url: string
content: string
mode: ScraperLoadOptions['mode']
}

View File

@ -16,6 +16,12 @@ export type PageOptions = {
};
export type ExtractorOptions = {
mode: "markdown" | "llm-extraction";
extractionPrompt?: string;
extractionSchema?: Record<string, any>;
}
export type SearchOptions = {
limit?: number;
tbs?: string;
@ -38,6 +44,7 @@ export type WebScraperOptions = {
replaceAllPathsWithAbsolutePaths?: boolean;
};
pageOptions?: PageOptions;
extractorOptions?: ExtractorOptions;
concurrentRequests?: number;
};
@ -50,6 +57,7 @@ export class Document {
url?: string; // Used only in /search for now
content: string;
markdown?: string;
llm_extraction?: string;
createdAt?: Date;
updatedAt?: Date;
type?: string;

View File

@ -1,4 +1,4 @@
import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
import OpenAI from 'openai'
export class WebScraperDataProvider {
private urls: string[] = [""];
@ -19,6 +21,7 @@ export class WebScraperDataProvider {
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
private pageOptions?: PageOptions;
private extractorOptions?: ExtractorOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";
@ -191,6 +194,22 @@ export class WebScraperDataProvider {
documents = await this.getSitemapData(baseUrl, documents);
documents = documents.concat(pdfDocuments);
if(this.extractorOptions.mode === "llm-extraction") {
// const llm = new OpenAI()
// generateCompletions(
// client=llm,
// page =,
// schema=
// )
}
await this.setCachedDocuments(documents);
documents = this.removeChildLinks(documents);
documents = documents.splice(0, this.limit);
@ -376,6 +395,7 @@ export class WebScraperDataProvider {
this.generateImgAltText =
options.crawlerOptions?.generateImgAltText ?? false;
this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
//! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check