Mirror of https://github.com/mendableai/firecrawl.git, synced 2024-11-16 03:32:22 +08:00
Merge branch 'main' into nsc/geo-to-location
Commit 07e76f2ba5

README.md (16 lines changed)
@@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
 - **Media parsing**: pdfs, docx, images.
 - **Reliability first**: designed to get the data you need - no matter how hard it is.
 - **Actions**: click, scroll, input, wait and more before extracting data
+- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
 
 You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
 
@@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     }'
 ```
 
+### Batch Scraping Multiple URLs (New)
+
+You can now batch scrape multiple URLs at the same time. It works very similarly to the /crawl endpoint: it submits a batch scrape job and returns a job ID that you can use to check the status of the batch scrape.
+
+```bash
+curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -d '{
+      "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
+      "formats": ["markdown", "html"]
+    }'
+```
+
 ### Search (v0) (Beta)
 
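The batch endpoint above returns a job ID rather than the scraped data itself. As a quick illustration (not part of this diff), the TypeScript sketch below polls a status endpoint until the job finishes; the `GET /v1/batch/scrape/{id}` path and the `status` field in the response mirror how the crawl endpoint behaves and are assumptions here, not something this commit documents.

```ts
// Minimal sketch: poll a batch scrape job until it finishes.
// Assumptions (not from this diff): status lives at GET /v1/batch/scrape/{id}
// and the response carries a `status` field like the crawl endpoint does.
const API_KEY = process.env.FIRECRAWL_API_KEY ?? "YOUR_API_KEY";

async function waitForBatchScrape(jobId: string): Promise<unknown> {
  while (true) {
    const res = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
      headers: { Authorization: `Bearer ${API_KEY}` },
    });
    const body = await res.json();
    if (body.status === "completed" || body.status === "failed") {
      return body; // carries the scraped documents when completed
    }
    await new Promise((r) => setTimeout(r, 2000)); // back off before polling again
  }
}
```

With the curl example above, the `id` field of the submit response is what you would pass to `waitForBatchScrape`.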
@@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
   scrapeOptions: {
     formats: ['markdown', 'html'],
   }
-} as CrawlParams, true, 30) as CrawlStatusResponse;
+} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;
 
 if (crawlResponse) {
   console.log(crawlResponse)
@@ -1,5 +1,5 @@
 # ===== Required ENVS ======
 NUM_WORKERS_PER_QUEUE=8
 PORT=3002
 HOST=0.0.0.0
 REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true
 
 # ===== Optional ENVS ======
 
+# SearchApi key. Head to https://searchapi.com/ to get your API key
+SEARCHAPI_API_KEY=
+# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
+SEARCHAPI_ENGINE=
+
 # Supabase Setup (used to support DB authentication, advanced logging, etc.)
 SUPABASE_ANON_TOKEN=
 SUPABASE_URL=
 SUPABASE_SERVICE_TOKEN=
 
 # Other Optionals
@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
 BULL_AUTH_KEY=
 LOGTAIL_KEY=
 PLAYWRIGHT_MICROSERVICE_URL=
-
+SEARCHAPI_API_KEY=
@@ -4,6 +4,7 @@ import {
   BatchScrapeRequest,
   batchScrapeRequestSchema,
   CrawlResponse,
+  legacyExtractorOptions,
   legacyScrapeOptions,
   RequestWithAuth,
 } from "./types";
@@ -34,6 +35,8 @@ export async function batchScrapeController(
   }
 
   const pageOptions = legacyScrapeOptions(req.body);
+  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
+
 
   const sc: StoredCrawl = {
     crawlerOptions: null,
@@ -65,6 +68,7 @@ export async function batchScrapeController(
     plan: req.auth.plan,
     crawlerOptions: null,
     pageOptions,
+    extractorOptions,
     origin: "api",
     crawl_id: id,
     sitemapped: true,
@@ -121,8 +121,13 @@ export async function runWebScraper({
     : docs;
 
   if(is_scrape === false) {
-    billTeam(team_id, undefined, filteredDocs.length).catch(error => {
-      Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
+    let creditsToBeBilled = 1; // Assuming 1 credit per document
+    if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
+      creditsToBeBilled = 5;
+    }
+
+    billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
+      Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
       // Optionally, you could notify an admin or add to a retry queue here
     });
   }
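For reference, the billing change above boils down to a per-document multiplier. The standalone TypeScript sketch below restates that rule outside the scraper; the `creditsForBatch` helper and the trimmed-down `ExtractorOptions` type are illustrative stand-ins, not code from the repository.

```ts
// Illustrative helper (not from the codebase): mirrors the billing rule above.
// 1 credit per scraped document, 5 credits per document when LLM extraction is used.
type ExtractorMode = "llm-extraction" | "extract" | "markdown";

interface ExtractorOptions {
  mode: ExtractorMode;
}

function creditsForBatch(docCount: number, extractorOptions?: ExtractorOptions): number {
  const perDoc =
    extractorOptions &&
    (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")
      ? 5
      : 1;
  return perDoc * docCount;
}

// Example: 10 documents scraped with LLM extraction -> 50 credits billed.
console.log(creditsForBatch(10, { mode: "llm-extraction" })); // 50
```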
@@ -209,14 +209,15 @@ export async function scrapSingleUrl(
       if (action.type === "click" || action.type === "write" || action.type === "press") {
         const result: Action[] = [];
         // Don't add a wait if the previous action is a wait
-        if (index === 0 || array[index - 1].type !== "wait") {
-          result.push({ type: "wait", milliseconds: 1200 } as Action);
-        }
+        // if (index === 0 || array[index - 1].type !== "wait") {
+        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
+        // }
+        // Fire-engine now handles wait times automatically, leaving the code here for now
         result.push(action);
         // Don't add a wait if the next action is a wait
-        if (index === array.length - 1 || array[index + 1].type !== "wait") {
-          result.push({ type: "wait", milliseconds: 1200 } as Action);
-        }
+        // if (index === array.length - 1 || array[index + 1].type !== "wait") {
+        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
+        // }
         return result;
       }
       return [action as Action];
@@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
 import { googleSearch } from "./googlesearch";
 import { fireEngineMap } from "./fireEngine";
+import { searchapi_search } from "./searchapi";
 import { serper_search } from "./serper";
 
 export async function search({
@@ -30,7 +31,16 @@ export async function search({
   timeout?: number;
 }): Promise<SearchResult[]> {
   try {
+
+    if (process.env.SEARCHAPI_API_KEY) {
+      return await searchapi_search(query, {
+        num_results,
+        tbs,
+        filter,
+        lang,
+        country,
+        location
+      });
+    }
     if (process.env.SERPER_API_KEY) {
       return await serper_search(query, {
         num_results,
apps/api/src/search/searchapi.ts (new file, 60 lines)
@@ -0,0 +1,60 @@
+import axios from "axios";
+import dotenv from "dotenv";
+import { SearchResult } from "../../src/lib/entities";
+
+dotenv.config();
+
+interface SearchOptions {
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  num_results: number;
+  page?: number;
+}
+
+export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
+  const params = {
+    q: q,
+    hl: options.lang,
+    gl: options.country,
+    location: options.location,
+    num: options.num_results,
+    page: options.page ?? 1,
+    engine: process.env.SEARCHAPI_ENGINE || "google",
+  };
+
+  const url = `https://www.searchapi.io/api/v1/search`;
+
+  try {
+    const response = await axios.get(url, {
+      headers: {
+        "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
+        "Content-Type": "application/json",
+        "X-SearchApi-Source": "Firecrawl",
+      },
+      params: params,
+    });
+
+
+    if (response.status === 401) {
+      throw new Error("Unauthorized. Please check your API key.");
+    }
+
+    const data = response.data;
+
+    if (data && Array.isArray(data.organic_results)) {
+      return data.organic_results.map((a: any) => ({
+        url: a.link,
+        title: a.title,
+        description: a.snippet,
+      }));
+    } else {
+      return [];
+    }
+  } catch (error) {
+    console.error(`There was an error searching for content: ${error.message}`);
+    return [];
+  }
+}
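For context, here is a hypothetical standalone usage of the new SearchApi provider (not part of this diff). It assumes `SEARCHAPI_API_KEY` (and optionally `SEARCHAPI_ENGINE`) are set as in the .env example above, and the import path is only a placeholder for wherever the module is consumed from.

```ts
// Hypothetical usage sketch; the import path below is a placeholder.
import { searchapi_search } from "./apps/api/src/search/searchapi";

async function demo(): Promise<void> {
  // Reads SEARCHAPI_API_KEY and SEARCHAPI_ENGINE from the environment.
  const results = await searchapi_search("firecrawl web scraping", {
    num_results: 5,
    lang: "en",
    country: "us",
  });

  // Each result maps SearchApi's organic_results to { url, title, description }.
  for (const r of results) {
    console.log(`${r.title} -> ${r.url}`);
  }
}

demo();
```

Because `search()` checks `SEARCHAPI_API_KEY` before `SERPER_API_KEY`, setting the former is enough to route queries through SearchApi.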
@@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp
 import json
 from dotenv import load_dotenv
 import anthropic
+import agentops
 
 # ANSI color codes
 class Colors:
@@ -161,4 +162,5 @@ def main():
         print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
 
 if __name__ == "__main__":
+    agentops.init(os.getenv("AGENTOPS_API_KEY"))
     main()
@@ -98,7 +98,7 @@
    "source": [
     "# Create a cache with a 5 minute TTL\n",
     "cache = caching.CachedContent.create(\n",
-    "    model=\"models/gemini-1.5-pro-001\",\n",
+    "    model=\"models/gemini-1.5-pro-002\",\n",
     "    display_name=\"website crawl testing again\", # used to identify the cache\n",
     "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
     "    contents=[text_file],\n",
@@ -0,0 +1,166 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import datetime\n",
+    "import time\n",
+    "import google.generativeai as genai\n",
+    "from google.generativeai import caching\n",
+    "from dotenv import load_dotenv\n",
+    "from firecrawl import FirecrawlApp\n",
+    "import json\n",
+    "\n",
+    "# Load environment variables\n",
+    "load_dotenv()\n",
+    "\n",
+    "# Retrieve API keys from environment variables\n",
+    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
+    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
+    "\n",
+    "# Configure the Google Generative AI module with the API key\n",
+    "genai.configure(api_key=google_api_key)\n",
+    "\n",
+    "# Initialize the FirecrawlApp with your API key\n",
+    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No data returned from crawl.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Crawl a website\n",
+    "crawl_url = 'https://dify.ai/'\n",
+    "params = {\n",
+    "  \n",
+    "  'crawlOptions': {\n",
+    "    'limit': 100\n",
+    "  }\n",
+    "}\n",
+    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
+    "\n",
+    "if crawl_result is not None:\n",
+    "  # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
+    "  cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
+    "\n",
+    "  # Save the modified results as a text file containing JSON data\n",
+    "  with open('crawl_result.txt', 'w') as file:\n",
+    "    file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
+    "else:\n",
+    "  print(\"No data returned from crawl.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Upload the video using the Files API\n",
+    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
+    "\n",
+    "# Wait for the file to finish processing\n",
+    "while text_file.state.name == \"PROCESSING\":\n",
+    "  print('Waiting for file to be processed.')\n",
+    "  time.sleep(2)\n",
+    "  text_file = genai.get_file(text_file.name)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a cache with a 5 minute TTL\n",
+    "cache = caching.CachedContent.create(\n",
+    "  model=\"models/gemini-1.5-flash-002\",\n",
+    "  display_name=\"website crawl testing again\", # used to identify the cache\n",
+    "  system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
+    "  contents=[text_file],\n",
+    "  ttl=datetime.timedelta(minutes=15),\n",
+    ")\n",
+    "# Construct a GenerativeModel which uses the created cache.\n",
+    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
+      "\n",
+      "Here's how Firecrawl helps:\n",
+      "\n",
+      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
+      "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
+      "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
+      "\n",
+      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
+      "\n",
+      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Query the model\n",
+    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
+    "response_dict = response.to_dict()\n",
+    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
+    "print(response_text)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}