Caleb: initially pulled inspiration code from https://github.com/mishushakov/llm-scraper

Caleb Peffer 2024-04-28 13:59:35 -07:00
parent e6d7a4761d
commit 6ee1f2d3bc
5 changed files with 145 additions and 13 deletions

View File

@@ -68,6 +68,7 @@
"gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.3.2",
"joplin-turndown-plugin-gfm": "^1.0.12",
"json-schema-to-zod": "^2.1.0",
"keyword-extractor": "^0.0.25",
"langchain": "^0.1.25",
"languagedetect": "^2.0.0",
@@ -93,7 +94,9 @@
"unstructured-client": "^0.9.4",
"uuid": "^9.0.1",
"wordpos": "^2.1.0",
"xml2js": "^0.6.2"
"xml2js": "^0.6.2",
"zod": "^3.23.4",
"zod-to-json-schema": "^3.23.0"
},
"nodemonConfig": {
"ignore": [

View File

@@ -86,6 +86,9 @@ dependencies:
joplin-turndown-plugin-gfm:
specifier: ^1.0.12
version: 1.0.12
+  json-schema-to-zod:
+    specifier: ^2.1.0
+    version: 2.1.0
keyword-extractor:
specifier: ^0.0.25
version: 0.0.25
@@ -164,6 +167,12 @@ dependencies:
xml2js:
specifier: ^0.6.2
version: 0.6.2
+  zod:
+    specifier: ^3.23.4
+    version: 3.23.4
+  zod-to-json-schema:
+    specifier: ^3.23.0
+    version: 3.23.0(zod@3.23.4)
devDependencies:
'@flydotio/dockerfile':
@@ -1200,7 +1209,7 @@ packages:
redis: 4.6.13
typesense: 1.7.2(@babel/runtime@7.24.0)
uuid: 9.0.1
-      zod: 3.22.4
+      zod: 3.23.4
transitivePeerDependencies:
- encoding
dev: false
@@ -1218,8 +1227,8 @@ packages:
p-queue: 6.6.2
p-retry: 4.6.2
uuid: 9.0.1
-      zod: 3.22.4
-      zod-to-json-schema: 3.22.4(zod@3.22.4)
+      zod: 3.23.4
+      zod-to-json-schema: 3.23.0(zod@3.23.4)
dev: false
/@langchain/openai@0.0.18:
@@ -1229,8 +1238,8 @@ packages:
'@langchain/core': 0.1.43
js-tiktoken: 1.0.10
openai: 4.28.4
-      zod: 3.22.4
-      zod-to-json-schema: 3.22.4(zod@3.22.4)
+      zod: 3.23.4
+      zod-to-json-schema: 3.23.0(zod@3.23.4)
transitivePeerDependencies:
- encoding
dev: false
@@ -3985,6 +3994,11 @@ packages:
/json-parse-even-better-errors@2.3.1:
resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==}
+  /json-schema-to-zod@2.1.0:
+    resolution: {integrity: sha512-7ishNgYY+AbIKeeHcp5xCOdJbdVwSfDx/4V2ktc16LUusCJJbz2fEKdWUmAxhKIiYzhZ9Fp4E8OsAoM/h9cOLA==}
+    hasBin: true
+    dev: false
/json5@2.2.3:
resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==}
engines: {node: '>=6'}
@@ -4209,8 +4223,8 @@ packages:
redis: 4.6.13
uuid: 9.0.1
yaml: 2.4.1
-      zod: 3.22.4
-      zod-to-json-schema: 3.22.4(zod@3.22.4)
+      zod: 3.23.4
+      zod-to-json-schema: 3.23.0(zod@3.23.4)
transitivePeerDependencies:
- '@aws-crypto/sha256-js'
- '@aws-sdk/client-bedrock-agent-runtime'
@@ -5069,7 +5083,7 @@ packages:
sbd: 1.0.19
typescript: 5.4.5
uuid: 9.0.1
-      zod: 3.22.4
+      zod: 3.23.4
transitivePeerDependencies:
- debug
dev: false
@@ -6185,14 +6199,18 @@ packages:
engines: {node: '>=10'}
dev: true
-  /zod-to-json-schema@3.22.4(zod@3.22.4):
-    resolution: {integrity: sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==}
+  /zod-to-json-schema@3.23.0(zod@3.23.4):
+    resolution: {integrity: sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==}
    peerDependencies:
-      zod: ^3.22.4
+      zod: ^3.23.3
    dependencies:
-      zod: 3.22.4
+      zod: 3.23.4
dev: false
/zod@3.22.4:
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
dev: false
+  /zod@3.23.4:
+    resolution: {integrity: sha512-/AtWOKbBgjzEYYQRNfoGKHObgfAZag6qUJX1VbHo2PRBgS+wfWagEY2mizjfyAPcGesrJOcx/wcl0L9WnVrHFw==}
+    dev: false

View File

@@ -0,0 +1,99 @@
import OpenAI from 'openai'
import { z } from 'zod'
import { ScraperLoadResult } from './types'
// import {
// LlamaModel,
// LlamaJsonSchemaGrammar,
// LlamaContext,
// LlamaChatSession,
// GbnfJsonSchema,
// } from 'node-llama-cpp'
import { JsonSchema7Type } from 'zod-to-json-schema'
export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
data: z.infer<T> | null
url: string
}
const defaultPrompt =
'You are a sophisticated web scraper. Extract the contents of the webpage'
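// Convert a loaded page into the chat content parts OpenAI expects:
// screenshots become base64 image_url parts, everything else is sent as text.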
function prepareOpenAIPage(
page: ScraperLoadResult
): OpenAI.Chat.Completions.ChatCompletionContentPart[] {
if (page.mode === 'image') {
return [
{
type: 'image_url',
image_url: { url: `data:image/jpeg;base64,${page.content}` },
},
]
}
return [{ type: 'text', text: page.content }]
}
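// Ask the model to extract data matching the caller's JSON schema by exposing
// it as a single "extract_content" function tool, then parse the arguments of
// the resulting tool call.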
export async function generateOpenAICompletions<T extends z.ZodSchema<any>>(
client: OpenAI,
model: string = 'gpt-3.5-turbo',
page: ScraperLoadResult,
schema: JsonSchema7Type,
prompt: string = defaultPrompt,
temperature?: number
): Promise<ScraperCompletionResult<T>> {
  const content = prepareOpenAIPage(page)

  const completion = await client.chat.completions.create({
model,
messages: [
{
role: 'system',
content: prompt,
},
{ role: 'user', content },
],
tools: [
{
type: 'function',
function: {
name: 'extract_content',
description: 'Extracts the content from the given webpage(s)',
parameters: schema,
},
},
],
tool_choice: 'auto',
temperature,
})
  // tool_choice 'auto' means the model may answer without calling the tool,
  // so guard against a missing tool call before parsing its arguments.
  const toolCall = completion.choices[0].message.tool_calls?.[0]
  if (!toolCall) {
    return { data: null, url: page.url }
  }

  return {
    data: JSON.parse(toolCall.function.arguments),
    url: page.url,
  }
}
// export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
// model: LlamaModel,
// page: ScraperLoadResult,
// schema: JsonSchema7Type,
// prompt: string = defaultPrompt,
// temperature?: number
// ): Promise<ScraperCompletionResult<T>> {
// const grammar = new LlamaJsonSchemaGrammar(schema as GbnfJsonSchema) as any // any, because it has weird type inference going on
// const context = new LlamaContext({ model })
// const session = new LlamaChatSession({ context })
// const pagePrompt = `${prompt}\n${page.content}`
// const result = await session.prompt(pagePrompt, {
// grammar,
// temperature,
// })
// const parsed = grammar.parse(result)
// return {
// data: parsed,
// url: page.url,
// }
// }
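For reference, a minimal sketch of how the new helper might be wired up end to end. The zod schema, the hand-built page object, and the import paths are illustrative assumptions, not part of this commit:

import OpenAI from 'openai'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
// Hypothetical paths; the commit does not show where these modules live.
import { generateOpenAICompletions } from './models'
import { ScraperLoadResult } from './types'

// Illustrative schema describing the structured data we want back.
const schema = z.object({
  title: z.string(),
  summary: z.string(),
})

async function main() {
  const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })

  // A hand-built page result; in practice this comes from the scraper.
  const page: ScraperLoadResult = {
    url: 'https://example.com',
    content: '<html><body><h1>Example</h1></body></html>',
    mode: 'html',
  }

  // zod-to-json-schema turns the zod schema into the JSON Schema object that
  // generateOpenAICompletions exposes as the tool's parameters.
  const result = await generateOpenAICompletions<typeof schema>(
    client,
    'gpt-3.5-turbo',
    page,
    zodToJsonSchema(schema),
    undefined, // fall back to the default prompt
    0
  )
  console.log(result.url, result.data)
}

main().catch(console.error)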

View File

@@ -0,0 +1,10 @@
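// Shared scraper types: ScraperLoadOptions picks the capture format ('image'
// is a base64-encoded screenshot), and ScraperLoadResult pairs the captured
// content with its source URL and the mode used.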
export type ScraperLoadOptions = {
mode?: 'html' | 'text' | 'markdown' | 'image'
closeOnFinish?: boolean
}
export type ScraperLoadResult = {
url: string
content: string
mode: ScraperLoadOptions['mode']
}

View File

@@ -140,6 +140,8 @@ export async function scrapSingleUrl(
}
break;
}
//* TODO: add an option to return markdown or structured/extracted content
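// Strip unwanted elements from the raw HTML, then return both the markdown
// conversion and the original HTML.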
let cleanedHtml = removeUnwantedElements(text, pageOptions);
return [await parseMarkdown(cleanedHtml), text];