Merge branch 'main' into nsc/geo-to-location

Nicolas 2024-10-28 20:24:29 -03:00
commit 07e76f2ba5
11 changed files with 282 additions and 15 deletions

View File

@@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Actions**: click, scroll, input, wait, and more before extracting data.
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint.
You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev).
@@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
}'
```
### Batch Scraping Multiple URLs (New)
You can now batch scrape multiple URLs at the same time. It works very similarly to the /crawl endpoint: you submit a batch scrape job and receive a job ID that you can use to check the status of the batch scrape.
```bash
curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
"formats" : ["markdown", "html"]
}'
```
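Once the job is submitted, the returned job ID can be polled for results. A minimal TypeScript sketch, assuming the status endpoint follows the same `/v1/batch/scrape/{id}` pattern as the crawl status endpoint (the response field names `status` and `data` are assumptions here):

```ts
// Poll a batch scrape job (endpoint path assumed to mirror /v1/crawl/{id}).
const jobId = "YOUR_JOB_ID"; // returned by the POST /v1/batch/scrape call above

const res = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
  headers: { Authorization: "Bearer YOUR_API_KEY" },
});

const job = await res.json();
console.log(job.status);       // e.g. "scraping" or "completed" (assumed values)
console.log(job.data?.length); // number of scraped documents once completed
```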
### Search (v0) (Beta)
@@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
scrapeOptions: {
formats: ['markdown', 'html'],
}
} as CrawlParams, true, 30) as CrawlStatusResponse;
} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;
if (crawlResponse) {
console.log(crawlResponse)

View File

@@ -11,6 +11,11 @@ USE_DB_AUTHENTICATION=true
# ===== Optional ENVS ======
# SearchApi key. Head to https://searchapi.com/ to get your API key
SEARCHAPI_API_KEY=
# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
SEARCHAPI_ENGINE=
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
SUPABASE_ANON_TOKEN=
SUPABASE_URL=

View File

@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
BULL_AUTH_KEY=
LOGTAIL_KEY=
PLAYWRIGHT_MICROSERVICE_URL=
SEARCHAPI_API_KEY=

View File

@@ -4,6 +4,7 @@ import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
@@ -34,6 +35,8 @@ export async function batchScrapeController(
}
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const sc: StoredCrawl = {
crawlerOptions: null,
@@ -65,6 +68,7 @@
plan: req.auth.plan,
crawlerOptions: null,
pageOptions,
extractorOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
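For reference, a hedged sketch of a batch scrape request body that would exercise the new `extractorOptions` path; only `urls` and the presence of an `extract` field come from the controller code above, the remaining field names are assumptions:

```ts
// Hypothetical POST /v1/batch/scrape body with extraction enabled.
// The controller reads req.body.extract and maps it via legacyExtractorOptions.
const body = {
  urls: ["https://docs.firecrawl.dev"],
  formats: ["extract"], // assumed format name for structured extraction
  extract: {
    prompt: "Return the page title and a one-sentence summary.", // assumed field
  },
};

const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer YOUR_API_KEY",
  },
  body: JSON.stringify(body),
});
console.log(await res.json()); // expected to include the batch job id
```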

View File

@@ -121,8 +121,13 @@ export async function runWebScraper({
: docs;
if(is_scrape === false) {
billTeam(team_id, undefined, filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
creditsToBeBilled = 5;
}
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
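The billing rule introduced above boils down to a small pure function; a minimal sketch (the helper name `creditsToBill` is hypothetical, not part of the codebase):

```ts
// 1 credit per document, bumped to 5 per document when LLM extraction is used,
// mirroring the branch on extractorOptions.mode above.
function creditsToBill(docCount: number, extractorMode?: string): number {
  const perDocument =
    extractorMode === "llm-extraction" || extractorMode === "extract" ? 5 : 1;
  return perDocument * docCount;
}

// e.g. creditsToBill(10) === 10, creditsToBill(10, "extract") === 50
```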

View File

@@ -209,14 +209,15 @@ export async function scrapSingleUrl(
if (action.type === "click" || action.type === "write" || action.type === "press") {
const result: Action[] = [];
// Don't add a wait if the previous action is a wait
if (index === 0 || array[index - 1].type !== "wait") {
result.push({ type: "wait", milliseconds: 1200 } as Action);
}
// if (index === 0 || array[index - 1].type !== "wait") {
// result.push({ type: "wait", milliseconds: 1200 } as Action);
// }
// Fire-engine now handles wait times automatically, leaving the code here for now
result.push(action);
// Don't add a wait if the next action is a wait
if (index === array.length - 1 || array[index + 1].type !== "wait") {
result.push({ type: "wait", milliseconds: 1200 } as Action);
}
// if (index === array.length - 1 || array[index + 1].type !== "wait") {
// result.push({ type: "wait", milliseconds: 1200 } as Action);
// }
return result;
}
return [action as Action];

View File

@@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { searchapi_search } from "./searchapi";
import { serper_search } from "./serper";
export async function search({
@@ -30,7 +31,16 @@ export async function search({
timeout?: number;
}): Promise<SearchResult[]> {
try {
if (process.env.SEARCHAPI_API_KEY) {
return await searchapi_search(query, {
num_results,
tbs,
filter,
lang,
country,
location
});
}
if (process.env.SERPER_API_KEY) {
return await serper_search(query, {
num_results,

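With this change, SearchApi takes priority over Serper (and the remaining fallbacks) whenever `SEARCHAPI_API_KEY` is set. A hedged usage sketch; field names not visible in the hunk above, such as `query`, are assumptions:

```ts
// Provider selection is driven purely by environment variables:
// SEARCHAPI_API_KEY is checked first, then SERPER_API_KEY, then the fallbacks.
const results = await search({
  query: "firecrawl web scraping", // assumed field name
  num_results: 5,
  lang: "en",
  country: "us",
});

console.log(results.map((r) => r.url));
```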
View File

@@ -0,0 +1,60 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
interface SearchOptions {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
num_results: number;
page?: number;
}
export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
const params = {
q: q,
hl: options.lang,
gl: options.country,
location: options.location,
num: options.num_results,
page: options.page ?? 1,
engine: process.env.SEARCHAPI_ENGINE || "google",
};
const url = `https://www.searchapi.io/api/v1/search`;
try {
const response = await axios.get(url, {
headers: {
"Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
"Content-Type": "application/json",
"X-SearchApi-Source": "Firecrawl",
},
params: params,
});
if (response.status === 401) {
throw new Error("Unauthorized. Please check your API key.");
}
const data = response.data;
if (data && Array.isArray(data.organic_results)) {
return data.organic_results.map((a: any) => ({
url: a.link,
title: a.title,
description: a.snippet,
}));
} else {
return [];
}
} catch (error) {
console.error(`There was an error searching for content: ${error.message}`);
return [];
}
}
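A short usage sketch for the new helper, grounded in the signature above; the query string and option values are illustrative, and `SEARCHAPI_API_KEY` must be set in the environment:

```ts
import { searchapi_search } from "./searchapi";

// Returns [] on request errors or when SearchApi responds without organic_results.
const results = await searchapi_search("firecrawl web scraping", {
  num_results: 5,
  lang: "en",
  country: "us",
});

for (const r of results) {
  console.log(r.title, "->", r.url);
}
```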

View File

@@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
import agentops
# ANSI color codes
class Colors:
@@ -161,4 +162,5 @@ def main():
print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
if __name__ == "__main__":
agentops.init(os.getenv("AGENTOPS_API_KEY"))
main()

View File

@@ -98,7 +98,7 @@
"source": [
"# Create a cache with a 5 minute TTL\n",
"cache = caching.CachedContent.create(\n",
" model=\"models/gemini-1.5-pro-001\",\n",
" model=\"models/gemini-1.5-pro-002\",\n",
" display_name=\"website crawl testing again\", # used to identify the cache\n",
" system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
" contents=[text_file],\n",

View File

@@ -0,0 +1,166 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import datetime\n",
"import time\n",
"import google.generativeai as genai\n",
"from google.generativeai import caching\n",
"from dotenv import load_dotenv\n",
"from firecrawl import FirecrawlApp\n",
"import json\n",
"\n",
"# Load environment variables\n",
"load_dotenv()\n",
"\n",
"# Retrieve API keys from environment variables\n",
"google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
"firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
"\n",
"# Configure the Google Generative AI module with the API key\n",
"genai.configure(api_key=google_api_key)\n",
"\n",
"# Initialize the FirecrawlApp with your API key\n",
"app = FirecrawlApp(api_key=firecrawl_api_key)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No data returned from crawl.\n"
]
}
],
"source": [
"# Crawl a website\n",
"crawl_url = 'https://dify.ai/'\n",
"params = {\n",
" \n",
" 'crawlOptions': {\n",
" 'limit': 100\n",
" }\n",
"}\n",
"crawl_result = app.crawl_url(crawl_url, params=params)\n",
"\n",
"if crawl_result is not None:\n",
" # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
" cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
"\n",
" # Save the modified results as a text file containing JSON data\n",
" with open('crawl_result.txt', 'w') as file:\n",
" file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
"else:\n",
" print(\"No data returned from crawl.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Upload the video using the Files API\n",
"text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
"\n",
"# Wait for the file to finish processing\n",
"while text_file.state.name == \"PROCESSING\":\n",
" print('Waiting for file to be processed.')\n",
" time.sleep(2)\n",
" text_file = genai.get_file(text_file.name)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Create a cache with a 5 minute TTL\n",
"cache = caching.CachedContent.create(\n",
" model=\"models/gemini-1.5-flash-002\",\n",
" display_name=\"website crawl testing again\", # used to identify the cache\n",
" system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
" contents=[text_file],\n",
" ttl=datetime.timedelta(minutes=15),\n",
")\n",
"# Construct a GenerativeModel which uses the created cache.\n",
"model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
"\n",
"Here's how Firecrawl helps:\n",
"\n",
"* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
"* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
"* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
"\n",
"You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
"\n",
"Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
"\n"
]
}
],
"source": [
"# Query the model\n",
"response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
"response_dict = response.to_dict()\n",
"response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
"print(response_text)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}