Caleb: got it to a testable state I believe

2024-11-16 11:42:24 +08:00 · 2024-04-28 15:52:09 -07:00 · 2024-04-28 15:52:09 -07:00 · 06497729e2
commit 06497729e2
parent 6ee1f2d3bc
7 changed files with 163 additions and 31 deletions
--- a/apps/api/src/tests/e2e_withAuth/index.test.ts
+++ b/apps/api/src/tests/e2e_withAuth/index.test.ts
@ -8,7 +8,7 @@ dotenv.config();
 const TEST_URL = "http://127.0.0.1:3002";


-  describe("E2E Tests for API Routes", () => {
+  describe.only("E2E Tests for API Routes", () => {
    beforeAll(() => {
      process.env.USE_DB_AUTHENTICATION = "true";
    });
@ -252,6 +252,48 @@ const TEST_URL = "http://127.0.0.1:3002";
      }, 60000); // 60 seconds
    });

+    describe("POST /v0/scrape with LLM Extraction", () => {
+      it("should extract data using LLM extraction mode", async () => {
+        const response = await request(TEST_URL)
+          .post("/v0/scrape")
+          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+          .set("Content-Type", "application/json")
+          .send({
+            url: "https://mendable.ai",
+            pageOptions: {
+              onlyMainContent: true
+            },
+            extractorOptions: { // keys must match the ExtractorOptions type in lib/entities
+              mode: "llm-extraction",
+              extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+              extractionSchema: {
+                type: "object",
+                properties: {
+                  company_mission: {
+                    type: "string"
+                  },
+                  supports_sso: {
+                    type: "boolean"
+                  },
+                  is_open_source: {
+                    type: "boolean"
+                  }
+                },
+                required: ["company_mission", "supports_sso", "is_open_source"]
+              }
+            }
+          });
+        // Logged to help debug the response shape while the feature is being wired up.
+        console.log("Response:", response.body);
+        // NOTE(review): extraction is stored on Document.llm_extraction as a raw string — confirm the shape asserted below.
+        expect(response.statusCode).toBe(200);
+        expect(response.body).toHaveProperty("data");
+        expect(response.body.data).toHaveProperty("company_mission");
+        expect(response.body.data).toHaveProperty("supports_sso");
+        expect(response.body.data).toHaveProperty("is_open_source");
+      }, 60000); // 60 seconds
+    });
+
    describe("GET /is-production", () => {
      it("should return the production status", async () => {
        const response = await request(TEST_URL).get("/is-production");
--- a/apps/api/src/controllers/scrape.ts
+++ b/apps/api/src/controllers/scrape.ts
@ -1,3 +1,4 @@
+import { ExtractorOptions } from './../lib/entities';
 import { Request, Response } from "express";
 import { WebScraperDataProvider } from "../scraper/WebScraper";
 import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
@ -11,7 +12,8 @@ export async function scrapeHelper(
  req: Request,
  team_id: string,
  crawlerOptions: any,
-  pageOptions: any
+  pageOptions: any,
+  extractorOptions: any
 ): Promise<{
  success: boolean;
  error?: string;
@ -35,6 +37,7 @@ export async function scrapeHelper(
      ...crawlerOptions,
    },
    pageOptions: pageOptions,
+    extractorOptions: extractorOptions
  });

  const docs = await a.getDocuments(false);
@ -79,6 +82,9 @@ export async function scrapeController(req: Request, res: Response) {
    }
    const crawlerOptions = req.body.crawlerOptions ?? {};
    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
+    const extractorOptions = req.body.extractorOptions ?? {
+      mode: "markdown"
+    }
    const origin = req.body.origin ?? "api";

    try {
@ -96,7 +102,8 @@ export async function scrapeController(req: Request, res: Response) {
      req,
      team_id,
      crawlerOptions,
-      pageOptions
+      pageOptions,
+      extractorOptions
    );
    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;
--- a/apps/api/src/lib/LLM-extraction/index.ts
+++ b/apps/api/src/lib/LLM-extraction/index.ts
@ -0,0 +1,48 @@
+import Turndown from 'turndown'
+import OpenAI from 'openai'
+// import { LlamaModel } from 'node-llama-cpp'
+import { z } from 'zod'
+import { zodToJsonSchema } from 'zod-to-json-schema'
+import {
+    ScraperCompletionResult,
+    generateOpenAICompletions,
+} from './models.js'
+import { Document, ExtractorOptions } from '../entities.js'
+
+/**
+ * Runs LLM extraction over every document via the OpenAI chat completions API.
+ *
+ * @param documents - Scraped documents to extract from. generateOpenAICompletions
+ *   rejects for any document that has no `markdown` content.
+ * @param extractionOptions - Supplies the schema (`extractionSchema`) and an
+ *   optional `extractionPrompt`.
+ * @param client - OpenAI client to use; a default-constructed one is created
+ *   when omitted.
+ * @returns One pending promise per input document, in input order. Each resolves
+ *   to a copy of its document with `llm_extraction` set.
+ * @throws Error synchronously when `extractionOptions.extractionSchema` is missing.
+ */
+export function generateCompletions(
+    documents: Document[],
+    extractionOptions: ExtractorOptions,
+    client: OpenAI = new OpenAI()
+): Promise<Document>[] {
+    const schema = extractionOptions.extractionSchema
+    const prompt = extractionOptions.extractionPrompt
+
+    // Fail fast rather than sending a request with no schema.
+    if (!schema) {
+        throw new Error('extractionSchema is required for LLM extraction')
+    }
+
+    // TODO add other models (e.g. node-llama-cpp) behind a client switch.
+    return documents.map((document) =>
+        generateOpenAICompletions({
+            client,
+            document,
+            schema,
+            // undefined falls through to generateOpenAICompletions' default prompt
+            prompt,
+        })
+    )
+}
--- a/apps/api/src/lib/LLM-extraction/models.ts
+++ b/apps/api/src/lib/LLM-extraction/models.ts
@ -1,6 +1,8 @@
 import OpenAI from 'openai'
 import { z } from 'zod'
 import { ScraperLoadResult } from './types'
+import { Document, ExtractorOptions } from "../../lib/entities";
+
 // import {
 //   LlamaModel,
 //   LlamaJsonSchemaGrammar,
@ -8,41 +10,45 @@ import { ScraperLoadResult } from './types'
 //   LlamaChatSession,
 //   GbnfJsonSchema,
 // } from 'node-llama-cpp'
-import { JsonSchema7Type } from 'zod-to-json-schema'
+// import { JsonSchema7Type } from 'zod-to-json-schema'

 export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: z.infer<T> | null
+  data: any | null
  url: string
 }

 const defaultPrompt =
  'You are a satistified web scraper. Extract the contents of the webpage'

-function prepareOpenAIPage(
-  page: ScraperLoadResult
+function prepareOpenAIDoc(
+  document: Document
 ): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
-  if (page.mode === 'image') {
-    return [
-      {
-        type: 'image_url',
-        image_url: { url: `data:image/jpeg;base64,${page.content}` },
-      },
-    ]
+
+  // Check if the markdown content exists in the document
+  if (!document.markdown) {
+    throw new Error("Markdown content is missing in the document.");
  }

-  return [{ type: 'text', text: page.content }]
+  return [{ type: 'text', text: document.markdown }]
 }

-export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
+export async function generateOpenAICompletions<T>({
+  client,
+  model = 'gpt-3.5-turbo',
+  document,
+  schema, //TODO - add zod dynamic type checking
+  prompt = defaultPrompt,
+  temperature
+}: {
  client: OpenAI,
-  model: string = 'gpt-3.5-turbo',
-  page: ScraperLoadResult,
-  schema: JsonSchema7Type,
-  prompt: string = defaultPrompt,
+  model?: string,
+  document: Document,
+  schema: any, // This should be replaced with a proper Zod schema type when available
+  prompt?: string,
  temperature?: number
-): Promise<ScraperCompletionResult<T>> {
+}): Promise<Document> {
  const openai = client as OpenAI
-  const content = prepareOpenAIPage(page)
+  const content = prepareOpenAIDoc(document)

  const completion = await openai.chat.completions.create({
    model,
@ -68,10 +74,16 @@ export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
  })

  const c = completion.choices[0].message.tool_calls[0].function.arguments
+  
+  // Extract the LLM extraction content from the completion response
+  const llmExtraction = c;
+
+  // Return the document with the LLM extraction content added
  return {
-    data: JSON.parse(c),
-    url: page.url,
-  }
+    ...document,
+    llm_extraction: llmExtraction
+  };
+   
 }

 // export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
--- a/apps/api/src/lib/LLM-extraction/types.ts
+++ b/apps/api/src/lib/LLM-extraction/types.ts
@ -3,8 +3,3 @@ export type ScraperLoadOptions = {
    closeOnFinish?: boolean
 }

-export type ScraperLoadResult = {
-    url: string
-    content: string
-    mode: ScraperLoadOptions['mode']
-}
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -16,6 +16,12 @@ export type PageOptions = {
  
 };

+export type ExtractorOptions = { // carried on WebScraperOptions.extractorOptions
+  mode: "markdown" | "llm-extraction"; // callers in this change default to "markdown"
+  extractionPrompt?: string; // optional prompt text read by generateCompletions
+  extractionSchema?: Record<string, any>; // presumably a JSON-schema-style object, as in the e2e test — TODO confirm
+}
+
 export type SearchOptions = {
  limit?: number;
  tbs?: string;
@ -38,6 +44,7 @@ export type WebScraperOptions = {
    replaceAllPathsWithAbsolutePaths?: boolean;
  };
  pageOptions?: PageOptions;
+  extractorOptions?: ExtractorOptions;
  concurrentRequests?: number;
 };

@ -50,6 +57,7 @@ export class Document {
  url?: string; // Used only in /search for now
  content: string;
  markdown?: string;
+  llm_extraction?: string;
  createdAt?: Date;
  updatedAt?: Date;
  type?: string;
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -1,4 +1,4 @@
-import { Document, PageOptions, WebScraperOptions } from "../../lib/entities";
+import { Document, ExtractorOptions, PageOptions, WebScraperOptions } from "../../lib/entities";
 import { Progress } from "../../lib/entities";
 import { scrapSingleUrl } from "./single_url";
 import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
@ -7,6 +7,8 @@ import { getValue, setValue } from "../../services/redis";
 import { getImageDescription } from "./utils/imageDescription";
 import { fetchAndProcessPdf } from "./utils/pdfProcessor";
 import { replaceImgPathsWithAbsolutePaths, replacePathsWithAbsolutePaths } from "./utils/replacePaths";
+import OpenAI from 'openai'
+

 export class WebScraperDataProvider {
  private urls: string[] = [""];
@ -19,6 +21,7 @@ export class WebScraperDataProvider {
  private concurrentRequests: number = 20;
  private generateImgAltText: boolean = false;
  private pageOptions?: PageOptions;
+  private extractorOptions?: ExtractorOptions;
  private replaceAllPathsWithAbsolutePaths?: boolean = false;
  private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" = "gpt-4-turbo";

@ -191,6 +194,22 @@ export class WebScraperDataProvider {
        documents = await this.getSitemapData(baseUrl, documents);
        documents = documents.concat(pdfDocuments);

+
+     
+
+        if(this.extractorOptions.mode === "llm-extraction") {
+
+          // const llm = new OpenAI()
+          // generateCompletions(
+          //   client=llm,
+          //   page =, 
+          //   schema= 
+            
+          // )
+            
+
+        }
+
        await this.setCachedDocuments(documents);
        documents = this.removeChildLinks(documents);
        documents = documents.splice(0, this.limit);
@ -376,6 +395,7 @@ export class WebScraperDataProvider {
    this.generateImgAltText =
      options.crawlerOptions?.generateImgAltText ?? false;
    this.pageOptions = options.pageOptions ?? {onlyMainContent: false};
+    this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
    this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;

    //! @nicolas, for some reason this was being injected and breakign everything. Don't have time to find source of the issue so adding this check