Merge branch 'main' into nsc/geo-to-location

Nicolas 2024-10-28 20:24:29 -03:00
commit 07e76f2ba5
11 changed files with 282 additions and 15 deletions

View File

@@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Actions**: click, scroll, input, wait, and more before extracting data.
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint.
You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev).
@@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
}'
```
### Batch Scraping Multiple URLs (New)
You can now batch scrape multiple URLs at the same time. It works very similarly to the /crawl endpoint: you submit a batch scrape job and receive a job ID that you can use to check the status of the batch scrape.
```bash
curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
"formats" : ["markdown", "html"]
}'
```
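Once the job is submitted, the returned job ID can be polled for results. A minimal TypeScript sketch, assuming the status endpoint follows the same `/v1/batch/scrape/{id}` pattern as the crawl status endpoint (the response field names `status` and `data` are assumptions here):

```ts
// Poll a batch scrape job (endpoint path assumed to mirror /v1/crawl/{id}).
const jobId = "YOUR_JOB_ID"; // returned by the POST /v1/batch/scrape call above

const res = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
  headers: { Authorization: "Bearer YOUR_API_KEY" },
});

const job = await res.json();
console.log(job.status);       // e.g. "scraping" or "completed" (assumed values)
console.log(job.data?.length); // number of scraped documents once completed
```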
### Search (v0) (Beta)
@@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
scrapeOptions: {
formats: ['markdown', 'html'],
}
} as CrawlParams, true, 30) as CrawlStatusResponse;
} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;
if (crawlResponse) {
console.log(crawlResponse)

View File

@@ -11,6 +11,11 @@ USE_DB_AUTHENTICATION=true
# ===== Optional ENVS ======
# SearchApi key. Head to https://searchapi.com/ to get your API key
SEARCHAPI_API_KEY=
# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
SEARCHAPI_ENGINE=
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
SUPABASE_ANON_TOKEN=
SUPABASE_URL=

View File

@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
BULL_AUTH_KEY=
LOGTAIL_KEY=
PLAYWRIGHT_MICROSERVICE_URL=
SEARCHAPI_API_KEY=

View File

@@ -4,6 +4,7 @@ import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
@@ -34,6 +35,8 @@ export async function batchScrapeController(
}
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const sc: StoredCrawl = {
crawlerOptions: null,
@@ -65,6 +68,7 @@
plan: req.auth.plan,
crawlerOptions: null,
pageOptions,
extractorOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
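For reference, a hedged sketch of a batch scrape request body that would exercise the new `extractorOptions` path; only `urls` and the presence of an `extract` field come from the controller code above, the remaining field names are assumptions:

```ts
// Hypothetical POST /v1/batch/scrape body with extraction enabled.
// The controller reads req.body.extract and maps it via legacyExtractorOptions.
const body = {
  urls: ["https://docs.firecrawl.dev"],
  formats: ["extract"], // assumed format name for structured extraction
  extract: {
    prompt: "Return the page title and a one-sentence summary.", // assumed field
  },
};

const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer YOUR_API_KEY",
  },
  body: JSON.stringify(body),
});
console.log(await res.json()); // expected to include the batch job id
```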

View File

@@ -121,8 +121,13 @@ export async function runWebScraper({
: docs;
if(is_scrape === false) {
billTeam(team_id, undefined, filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
creditsToBeBilled = 5;
}
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
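The billing rule introduced above boils down to a small pure function; a minimal sketch (the helper name `creditsToBill` is hypothetical, not part of the codebase):

```ts
// 1 credit per document, bumped to 5 per document when LLM extraction is used,
// mirroring the branch on extractorOptions.mode above.
function creditsToBill(docCount: number, extractorMode?: string): number {
  const perDocument =
    extractorMode === "llm-extraction" || extractorMode === "extract" ? 5 : 1;
  return perDocument * docCount;
}

// e.g. creditsToBill(10) === 10, creditsToBill(10, "extract") === 50
```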

View File

@@ -209,14 +209,15 @@ export async function scrapSingleUrl(
if (action.type === "click" || action.type === "write" || action.type === "press") {
const result: Action[] = [];
// Don't add a wait if the previous action is a wait
if (index === 0 || array[index - 1].type !== "wait") {
result.push({ type: "wait", milliseconds: 1200 } as Action);
}
// if (index === 0 || array[index - 1].type !== "wait") {
// result.push({ type: "wait", milliseconds: 1200 } as Action);
// }
// Fire-engine now handles wait times automatically, leaving the code here for now
result.push(action);
// Don't add a wait if the next action is a wait
if (index === array.length - 1 || array[index + 1].type !== "wait") {
result.push({ type: "wait", milliseconds: 1200 } as Action);
}
// if (index === array.length - 1 || array[index + 1].type !== "wait") {
// result.push({ type: "wait", milliseconds: 1200 } as Action);
// }
return result;
}
return [action as Action];

View File

@@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { searchapi_search } from "./searchapi";
import { serper_search } from "./serper";
export async function search({
@@ -30,7 +31,16 @@ export async function search({
timeout?: number;
}): Promise<SearchResult[]> {
try {
if (process.env.SEARCHAPI_API_KEY) {
return await searchapi_search(query, {
num_results,
tbs,
filter,
lang,
country,
location
});
}
if (process.env.SERPER_API_KEY) {
return await serper_search(query, {
num_results,

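With this change, SearchApi takes priority over Serper (and the remaining fallbacks) whenever `SEARCHAPI_API_KEY` is set. A hedged usage sketch; field names not visible in the hunk above, such as `query`, are assumptions:

```ts
// Provider selection is driven purely by environment variables:
// SEARCHAPI_API_KEY is checked first, then SERPER_API_KEY, then the fallbacks.
const results = await search({
  query: "firecrawl web scraping", // assumed field name
  num_results: 5,
  lang: "en",
  country: "us",
});

console.log(results.map((r) => r.url));
```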
View File

@@ -0,0 +1,60 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
interface SearchOptions {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
num_results: number;
page?: number;
}
export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
const params = {
q: q,
hl: options.lang,
gl: options.country,
location: options.location,
num: options.num_results,
page: options.page ?? 1,
engine: process.env.SEARCHAPI_ENGINE || "google",
};
const url = `https://www.searchapi.io/api/v1/search`;
try {
const response = await axios.get(url, {
headers: {
"Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
"Content-Type": "application/json",
"X-SearchApi-Source": "Firecrawl",
},
params: params,
});
if (response.status === 401) {
throw new Error("Unauthorized. Please check your API key.");
}
const data = response.data;
if (data && Array.isArray(data.organic_results)) {
return data.organic_results.map((a: any) => ({
url: a.link,
title: a.title,
description: a.snippet,
}));
} else {
return [];
}
} catch (error) {
console.error(`There was an error searching for content: ${error.message}`);
return [];
}
}
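A short usage sketch for the new helper, grounded in the signature above; the query string and option values are illustrative, and `SEARCHAPI_API_KEY` must be set in the environment:

```ts
import { searchapi_search } from "./searchapi";

// Returns [] on request errors or when SearchApi responds without organic_results.
const results = await searchapi_search("firecrawl web scraping", {
  num_results: 5,
  lang: "en",
  country: "us",
});

for (const r of results) {
  console.log(r.title, "->", r.url);
}
```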

View File

@@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
import agentops
# ANSI color codes
class Colors:
@@ -161,4 +162,5 @@ def main():
print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
if __name__ == "__main__":
agentops.init(os.getenv("AGENTOPS_API_KEY"))
main()

View File

@@ -98,7 +98,7 @@
"source": [
"# Create a cache with a 5 minute TTL\n",
"cache = caching.CachedContent.create(\n",
" model=\"models/gemini-1.5-pro-001\",\n",
" model=\"models/gemini-1.5-pro-002\",\n",
" display_name=\"website crawl testing again\", # used to identify the cache\n",
" system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
" contents=[text_file],\n",

View File

@@ -0,0 +1,166 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import datetime\n",
"import time\n",
"import google.generativeai as genai\n",
"from google.generativeai import caching\n",
"from dotenv import load_dotenv\n",
"from firecrawl import FirecrawlApp\n",
"import json\n",
"\n",
"# Load environment variables\n",
"load_dotenv()\n",
"\n",
"# Retrieve API keys from environment variables\n",
"google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
"firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
"\n",
"# Configure the Google Generative AI module with the API key\n",
"genai.configure(api_key=google_api_key)\n",
"\n",
"# Initialize the FirecrawlApp with your API key\n",
"app = FirecrawlApp(api_key=firecrawl_api_key)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No data returned from crawl.\n"
]
}
],
"source": [
"# Crawl a website\n",
"crawl_url = 'https://dify.ai/'\n",
"params = {\n",
" \n",
" 'crawlOptions': {\n",
" 'limit': 100\n",
" }\n",
"}\n",
"crawl_result = app.crawl_url(crawl_url, params=params)\n",
"\n",
"if crawl_result is not None:\n",
" # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
" cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
"\n",
" # Save the modified results as a text file containing JSON data\n",
" with open('crawl_result.txt', 'w') as file:\n",
" file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
"else:\n",
" print(\"No data returned from crawl.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Upload the video using the Files API\n",
"text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
"\n",
"# Wait for the file to finish processing\n",
"while text_file.state.name == \"PROCESSING\":\n",
" print('Waiting for file to be processed.')\n",
" time.sleep(2)\n",
" text_file = genai.get_file(text_file.name)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Create a cache with a 5 minute TTL\n",
"cache = caching.CachedContent.create(\n",
" model=\"models/gemini-1.5-flash-002\",\n",
" display_name=\"website crawl testing again\", # used to identify the cache\n",
" system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
" contents=[text_file],\n",
" ttl=datetime.timedelta(minutes=15),\n",
")\n",
"# Construct a GenerativeModel which uses the created cache.\n",
"model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
"\n",
"Here's how Firecrawl helps:\n",
"\n",
"* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
"* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
"* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
"\n",
"You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
"\n",
"Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
"\n"
]
}
],
"source": [
"# Query the model\n",
"response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
"response_dict = response.to_dict()\n",
"response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
"print(response_text)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}