From f4d8b2c89af5fa707e1b12ba85c8b6c5ec3534e0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 9 May 2024 10:36:56 -0300 Subject: [PATCH] Updated docs --- apps/js-sdk/example.js | 67 +++++++++++++++++++++++- apps/js-sdk/example.ts | 83 ++++++++++++++++++++++++++++++ apps/js-sdk/firecrawl/README.md | 36 +++++++++++++ apps/js-sdk/firecrawl/package.json | 2 +- apps/python-sdk/example.py | 58 +++++++++++++++++---- 5 files changed, 232 insertions(+), 14 deletions(-) create mode 100644 apps/js-sdk/example.ts diff --git a/apps/js-sdk/example.js b/apps/js-sdk/example.js index 7077b4c6..5f811925 100644 --- a/apps/js-sdk/example.js +++ b/apps/js-sdk/example.js @@ -1,7 +1,13 @@ import FirecrawlApp from '@mendable/firecrawl-js'; +import { z } from "zod"; -const app = new FirecrawlApp({apiKey: "YOUR_API_KEY"}); +const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); +// Scrape a website: +const scrapeResult = await app.scrapeUrl('firecrawl.dev'); +console.log(scrapeResult.data.content) + +// Crawl a website: const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); console.log(crawlResult) @@ -17,4 +23,61 @@ while (true) { await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second } -console.log(job.data[0].content); \ No newline at end of file +console.log(job.data[0].content); + +// Search for a query: +const query = 'what is mendable?' 
+const searchResult = await app.search(query) +console.log(searchResult) + +// LLM Extraction: +// Define schema to extract contents into using zod schema +const zodSchema = z.object({ + top: z + .array( + z.object({ + title: z.string(), + points: z.number(), + by: z.string(), + commentsURL: z.string(), + }) + ) + .length(5) + .describe("Top 5 stories on Hacker News"), +}); + +let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: zodSchema }, +}); + +console.log(llmExtractionResult.data.llm_extraction); + +// Define schema to extract contents into using json schema +const jsonSchema = { + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] +} + +llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: jsonSchema }, +}); + +console.log(llmExtractionResult.data.llm_extraction); \ No newline at end of file diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts new file mode 100644 index 00000000..9fa823ae --- /dev/null +++ b/apps/js-sdk/example.ts @@ -0,0 +1,83 @@ +import FirecrawlApp, { JobStatusResponse } from '@mendable/firecrawl-js'; +import { z } from "zod"; + +const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"}); + +// Scrape a website: +const scrapeResult = await app.scrapeUrl('firecrawl.dev'); +console.log(scrapeResult.data.content) + +// Crawl a website: +const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); +console.log(crawlResult) + +const jobId: string = await crawlResult['jobId']; 
+console.log(jobId); + +let job: JobStatusResponse; +while (true) { + job = await app.checkCrawlStatus(jobId); + if (job.status === 'completed') { + break; + } + await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second +} + +console.log(job.data[0].content); + +// Search for a query: +const query = 'what is mendable?' +const searchResult = await app.search(query) +console.log(searchResult) + +// LLM Extraction: +// Define schema to extract contents into using zod schema +const zodSchema = z.object({ + top: z + .array( + z.object({ + title: z.string(), + points: z.number(), + by: z.string(), + commentsURL: z.string(), + }) + ) + .length(5) + .describe("Top 5 stories on Hacker News"), +}); + +let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: zodSchema }, +}); + +console.log(llmExtractionResult.data.llm_extraction); + +// Define schema to extract contents into using json schema +const jsonSchema = { + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] +} + +llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: jsonSchema }, +}); + +console.log(llmExtractionResult.data.llm_extraction); \ No newline at end of file diff --git a/apps/js-sdk/firecrawl/README.md b/apps/js-sdk/firecrawl/README.md index 3f92c323..085e865b 100644 --- a/apps/js-sdk/firecrawl/README.md +++ b/apps/js-sdk/firecrawl/README.md @@ -77,6 +77,42 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. 
It takes scrapeExample(); ``` +### Extracting structured data from a URL + +With LLM extraction, you can easily extract structured data from any URL. We support zod schemas to make it easier for you too. Here is how to use it: + +```js +import { z } from "zod"; + +const zodSchema = z.object({ + top: z + .array( + z.object({ + title: z.string(), + points: z.number(), + by: z.string(), + commentsURL: z.string(), + }) + ) + .length(5) + .describe("Top 5 stories on Hacker News"), +}); + +let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", { + extractorOptions: { extractionSchema: zodSchema }, +}); + +console.log(llmExtractionResult.data.llm_extraction); +``` + +### Search for a query + +Used to search the web, get the most relevant results, scrape each page and return the markdown. + +```js +query = 'what is mendable?' +searchResult = app.search(query) +``` ### Crawling a Website diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index a9359cfb..9e1948a6 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.19", + "version": "0.0.20", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index a2e01739..d83be6de 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -1,20 +1,19 @@ from firecrawl import FirecrawlApp - app = FirecrawlApp(api_key="fc-YOUR_API_KEY") +# Scrape a website: +scrape_result = app.scrape_url('firecrawl.dev') +print(scrape_result['markdown']) + +# Crawl a website: crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) +print(crawl_result) -print(crawl_result[0]['markdown']) - -job_id = crawl_result['jobId'] -print(job_id) - -status = app.check_crawl_status(job_id) -print(status) - +# LLM Extraction: +# Define 
schema to extract contents into using pydantic from pydantic import BaseModel, Field -from typing import List, Optional +from typing import List class ArticleSchema(BaseModel): title: str @@ -25,7 +24,7 @@ class ArticleSchema(BaseModel): class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories") -a = app.scrape_url('https://news.ycombinator.com', { +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { 'extractorOptions': { 'extractionSchema': TopArticlesSchema.model_json_schema(), 'mode': 'llm-extraction' @@ -35,3 +34,40 @@ a = app.scrape_url('https://news.ycombinator.com', { } }) +print(llm_extraction_result['llm_extraction']) + +# Define schema to extract contents into using json schema +json_schema = { + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] +} + +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { + 'extractorOptions': { + 'extractionSchema': json_schema, + 'mode': 'llm-extraction' + }, + 'pageOptions':{ + 'onlyMainContent': True + } +}) + +print(llm_extraction_result['llm_extraction']) \ No newline at end of file