Merge pull request #137 from mendableai/nsc/llm-extraction-zod-integration

[Docs] Updated examples
2024-11-16 11:42:24 +08:00 · 2024-05-09 09:24:36 -07:00 · 2024-05-09 09:24:36 -07:00 · 832a4f53e0
commit 832a4f53e0
parent c02d7aeebd f4d8b2c89a
5 changed files with 232 additions and 14 deletions
--- a/apps/js-sdk/example.js
+++ b/apps/js-sdk/example.js
@ -1,7 +1,13 @@
 import FirecrawlApp from '@mendable/firecrawl-js';
+import { z } from "zod";

-const app = new FirecrawlApp({apiKey: "YOUR_API_KEY"});
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev');
+console.log(scrapeResult.data.content)
+
+// Crawl a website:
 const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
 console.log(crawlResult)

@ -17,4 +23,61 @@ while (true) {
  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
 }

-console.log(job.data[0].content);
+console.log(job.data[0].content);
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query)
+console.log(searchResult)
+
+// LLM Extraction:
+//  Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
--- a/apps/js-sdk/example.ts
+++ b/apps/js-sdk/example.ts
@ -0,0 +1,83 @@
+import FirecrawlApp, { JobStatusResponse } from '@mendable/firecrawl-js';
+import { z } from "zod";
+
+const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
+
+// Scrape a website:
+const scrapeResult = await app.scrapeUrl('firecrawl.dev');
+console.log(scrapeResult.data.content)
+
+// Crawl a website:
+const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
+console.log(crawlResult)
+
+const jobId: string = await crawlResult['jobId'];
+console.log(jobId);
+
+let job: JobStatusResponse;
+while (true) {
+  job = await app.checkCrawlStatus(jobId);
+  if (job.status === 'completed') {
+    break;
+  }
+  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
+}
+
+console.log(job.data[0].content);
+
+// Search for a query:
+const query = 'what is mendable?'
+const searchResult = await app.search(query)
+console.log(searchResult)
+
+// LLM Extraction:
+//  Define schema to extract contents into using zod schema
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+
+// Define schema to extract contents into using json schema
+const jsonSchema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: jsonSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
--- a/apps/js-sdk/firecrawl/README.md
+++ b/apps/js-sdk/firecrawl/README.md
@ -77,6 +77,42 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. It takes
  scrapeExample();
 ```

+### Extracting structured data from a URL
+
+With LLM extraction, you can easily extract structured data from any URL. We support zod schemas to make it easier for you too. Here is how to use it:
+
+```js
+import { z } from "zod";
+
+const zodSchema = z.object({
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe("Top 5 stories on Hacker News"),
+});
+
+let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
+  extractorOptions: { extractionSchema: zodSchema },
+});
+
+console.log(llmExtractionResult.data.llm_extraction);
+```
+
+### Search for a query
+
+Used to search the web, get the most relevant results, scrape each page, and return the markdown.
+
+```js
+query = 'what is mendable?'
+searchResult = app.search(query)
+```

 ### Crawling a Website

--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@ -1,6 +1,6 @@
 {
  "name": "@mendable/firecrawl-js",
-  "version": "0.0.19",
+  "version": "0.0.20",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "build/index.js",
  "types": "types/index.d.ts",
--- a/apps/python-sdk/example.py
+++ b/apps/python-sdk/example.py
@ -1,20 +1,19 @@
 from firecrawl import FirecrawlApp

-
 app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

+# Scrape a website:
+scrape_result = app.scrape_url('firecrawl.dev')
+print(scrape_result['markdown'])
+
+# Crawl a website:
 crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
+print(crawl_result)

-print(crawl_result[0]['markdown'])
-
-job_id = crawl_result['jobId']
-print(job_id)
-
-status = app.check_crawl_status(job_id)
-print(status)
-
+# LLM Extraction:
+# Define schema to extract contents into using pydantic
 from pydantic import BaseModel, Field
-from typing import List, Optional
+from typing import List

 class ArticleSchema(BaseModel):
    title: str
@ -25,7 +24,7 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

-a = app.scrape_url('https://news.ycombinator.com', {
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
@ -35,3 +34,40 @@ a = app.scrape_url('https://news.ycombinator.com', {
    }
 })

+print(llm_extraction_result['llm_extraction'])
+
+# Define schema to extract contents into using json schema
+json_schema = {
+  "type": "object",
+  "properties": {
+    "top": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "points": {"type": "number"},
+          "by": {"type": "string"},
+          "commentsURL": {"type": "string"}
+        },
+        "required": ["title", "points", "by", "commentsURL"]
+      },
+      "minItems": 5,
+      "maxItems": 5,
+      "description": "Top 5 stories on Hacker News"
+    }
+  },
+  "required": ["top"]
+}
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': json_schema,
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+
+print(llm_extraction_result['llm_extraction'])