feat: enhance the firecrawl tool (#6705)
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions

This commit is contained in:
非法操作 2024-07-27 15:00:06 +08:00 committed by GitHub
parent 082c46a903
commit 21f6caacd4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 598 additions and 118 deletions

View File

@ -1,22 +1,19 @@
from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.firecrawl.tools.crawl import CrawlTool
from core.tools.provider.builtin.firecrawl.tools.scrape import ScrapeTool
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
class FirecrawlProvider(BuiltinToolProviderController):
def _validate_credentials(self, credentials: dict) -> None:
try:
# Example validation using the Crawl tool
CrawlTool().fork_tool_runtime(
# Example validation using the ScrapeTool; only the title tag is scraped to keep the response small
ScrapeTool().fork_tool_runtime(
runtime={"credentials": credentials}
).invoke(
user_id='',
tool_parameters={
"url": "https://example.com",
"includes": '',
"excludes": '',
"limit": 1,
"onlyMainContent": True,
"url": "https://google.com",
"onlyIncludeTags": 'title'
}
)
except Exception as e:

View File

@ -31,8 +31,5 @@ credentials_for_provider:
label:
en_US: Firecrawl server's Base URL
zh_Hans: Firecrawl服务器的API URL
pt_BR: Firecrawl server's Base URL
placeholder:
en_US: https://www.firecrawl.dev
zh_HansL: https://www.firecrawl.dev
pt_BR: https://www.firecrawl.dev
en_US: https://api.firecrawl.dev

View File

@ -1,3 +1,4 @@
import json
import logging
import time
from collections.abc import Mapping
@ -8,6 +9,7 @@ from requests.exceptions import HTTPError
logger = logging.getLogger(__name__)
class FirecrawlApp:
def __init__(self, api_key: str | None = None, base_url: str | None = None):
self.api_key = api_key
@ -25,14 +27,16 @@ class FirecrawlApp:
return headers
def _request(
self,
method: str,
url: str,
data: Mapping[str, Any] | None = None,
headers: Mapping[str, str] | None = None,
retries: int = 3,
backoff_factor: float = 0.3,
self,
method: str,
url: str,
data: Mapping[str, Any] | None = None,
headers: Mapping[str, str] | None = None,
retries: int = 3,
backoff_factor: float = 0.3,
) -> Mapping[str, Any] | None:
if not headers:
headers = self._prepare_headers()
for i in range(retries):
try:
response = requests.request(method, url, json=data, headers=headers)
@ -47,47 +51,51 @@ class FirecrawlApp:
def scrape_url(self, url: str, **kwargs):
endpoint = f'{self.base_url}/v0/scrape'
headers = self._prepare_headers()
data = {'url': url, **kwargs}
response = self._request('POST', endpoint, data, headers)
logger.debug(f"Sent request to {endpoint=} body={data}")
response = self._request('POST', endpoint, data)
if response is None:
raise HTTPError("Failed to scrape URL after multiple retries")
return response
def search(self, query: str, **kwargs):
endpoint = f'{self.base_url}/v0/search'
headers = self._prepare_headers()
data = {'query': query, **kwargs}
response = self._request('POST', endpoint, data, headers)
logger.debug(f"Sent request to {endpoint=} body={data}")
response = self._request('POST', endpoint, data)
if response is None:
raise HTTPError("Failed to perform search after multiple retries")
return response
def crawl_url(
self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
self, url: str, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
):
endpoint = f'{self.base_url}/v0/crawl'
headers = self._prepare_headers(idempotency_key)
data = {'url': url, **kwargs['params']}
response = self._request('POST', endpoint, data, headers)
data = {'url': url, **kwargs}
logger.debug(f"Sent request to {endpoint=} body={data}")
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError("Failed to initiate crawl after multiple retries")
job_id: str = response['jobId']
if wait:
return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
return job_id
return response
def check_crawl_status(self, job_id: str):
endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
headers = self._prepare_headers()
response = self._request('GET', endpoint, headers=headers)
response = self._request('GET', endpoint)
if response is None:
raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
return response
def cancel_crawl_job(self, job_id: str):
    """Request cancellation of the crawl job identified by ``job_id``.

    Issues a ``DELETE`` through ``self._request`` against
    ``{base_url}/v0/crawl/cancel/{job_id}``.

    Args:
        job_id: Identifier of the crawl job to cancel.

    Returns:
        Whatever ``self._request`` returned for the cancel call.

    Raises:
        HTTPError: If ``self._request`` returned ``None``.
    """
    cancel_url = f'{self.base_url}/v0/crawl/cancel/{job_id}'
    result = self._request('DELETE', cancel_url)
    if result is not None:
        return result
    # A None result is treated as "no usable response was obtained".
    raise HTTPError(f"Failed to cancel job {job_id} after multiple retries")
def _monitor_job_status(self, job_id: str, poll_interval: int):
while True:
status = self.check_crawl_status(job_id)
@ -96,3 +104,21 @@ class FirecrawlApp:
elif status['status'] == 'failed':
raise HTTPError(f'Job {job_id} failed: {status["error"]}')
time.sleep(poll_interval)
def get_array_params(tool_parameters: dict[str, Any], key):
    """Parse a comma-separated tool parameter into a list of strings.

    Args:
        tool_parameters: Raw parameters passed to the tool.
        key: Name of the parameter to read.

    Returns:
        The comma-separated items with surrounding whitespace removed and
        empty items dropped, or ``None`` when the parameter is missing,
        empty, or contains no non-blank item.
    """
    param = tool_parameters.get(key)
    if not param:
        return None
    # The tool descriptions show inputs such as "script, .ad, #footer"; a bare
    # split(',') would yield ' .ad' and ' #footer' with a leading blank.
    items = [item.strip() for item in param.split(',')]
    return [item for item in items if item] or None
def get_json_params(tool_parameters: dict[str, Any], key):
    """Parse a tool parameter that holds a JSON document.

    Args:
        tool_parameters: Raw parameters passed to the tool.
        key: Name of the parameter to read.

    Returns:
        The decoded JSON value, or the original falsy value (``None``, ``''``)
        when the parameter is missing or empty.

    Raises:
        ValueError: If the value cannot be decoded as JSON.
    """
    param = tool_parameters.get(key)
    if param:
        try:
            try:
                # Try the text as-is first so that valid JSON containing an
                # apostrophe inside a value is not corrupted.
                param = json.loads(param)
            except json.JSONDecodeError:
                # support both single quotes and double quotes
                param = json.loads(param.replace("'", '"'))
        except (AttributeError, TypeError, ValueError) as e:
            raise ValueError(f"Invalid {key} format.") from e
    return param

View File

@ -1,36 +1,48 @@
import json
from typing import Any, Union
from typing import Any
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp, get_array_params, get_json_params
from core.tools.tool.builtin_tool import BuiltinTool
class CrawlTool(BuiltinTool):
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
"""
the crawlerOptions and pageOptions comes from doc here:
https://docs.firecrawl.dev/api-reference/endpoint/crawl
"""
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
base_url=self.runtime.credentials['base_url'])
crawlerOptions = {}
pageOptions = {}
options = {
'crawlerOptions': {
'excludes': tool_parameters.get('excludes', '').split(',') if tool_parameters.get('excludes') else [],
'includes': tool_parameters.get('includes', '').split(',') if tool_parameters.get('includes') else [],
'limit': tool_parameters.get('limit', 5)
},
'pageOptions': {
'onlyMainContent': tool_parameters.get('onlyMainContent', False)
}
}
wait_for_results = tool_parameters.get('wait_for_results', True)
crawlerOptions['excludes'] = get_array_params(tool_parameters, 'excludes')
crawlerOptions['includes'] = get_array_params(tool_parameters, 'includes')
crawlerOptions['returnOnlyUrls'] = tool_parameters.get('returnOnlyUrls', False)
crawlerOptions['maxDepth'] = tool_parameters.get('maxDepth')
crawlerOptions['mode'] = tool_parameters.get('mode')
crawlerOptions['ignoreSitemap'] = tool_parameters.get('ignoreSitemap', False)
crawlerOptions['limit'] = tool_parameters.get('limit', 5)
crawlerOptions['allowBackwardCrawling'] = tool_parameters.get('allowBackwardCrawling', False)
crawlerOptions['allowExternalContentLinks'] = tool_parameters.get('allowExternalContentLinks', False)
pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
crawl_result = app.crawl_url(
url=tool_parameters['url'],
params=options,
wait=True
url=tool_parameters['url'],
wait=wait_for_results,
crawlerOptions=crawlerOptions,
pageOptions=pageOptions
)
if not isinstance(crawl_result, str):
crawl_result = json.dumps(crawl_result, ensure_ascii=False, indent=4)
if not crawl_result:
return self.create_text_message("Crawl request failed.")
return self.create_text_message(crawl_result)
return self.create_json_message(crawl_result)

View File

@ -3,76 +3,243 @@ identity:
author: Richards Tu
label:
en_US: Crawl
zh_Hans: 爬取
zh_Hans: 深度爬取
description:
human:
en_US: Extract data from a website by crawling through a URL.
zh_Hans: 通过URL从网站中提取数据
en_US: Recursively search through a URL's subdomains and gather the content.
zh_Hans: 递归爬取一个网址的子域名,并收集内容
llm: This tool initiates a web crawl to extract data from a specified URL. It allows configuring crawler options such as including or excluding URL patterns, generating alt text for images using LLMs (paid plan required), limiting the maximum number of pages to crawl, and returning only the main content of the page. The tool can return either a list of crawled documents or a list of URLs based on the provided options.
parameters:
- name: url
type: string
required: true
label:
en_US: URL to crawl
zh_Hans: 要爬取的URL
en_US: Start URL
zh_Hans: 起始URL
human_description:
en_US: The URL of the website to crawl and extract data from.
zh_Hans: 要爬取并提取数据的网站URL。
en_US: The base URL to start crawling from.
zh_Hans: 要爬取网站的起始URL。
llm_description: The URL of the website that needs to be crawled. This is a required parameter.
form: llm
- name: wait_for_results
type: boolean
default: true
label:
en_US: Wait For Results
zh_Hans: 等待爬取结果
human_description:
en_US: If you choose not to wait, it will directly return a job ID. You can use this job ID to check the crawling results or cancel the crawling task, which is usually very useful for a large-scale crawling task.
zh_Hans: 如果选择不等待则会直接返回一个job_id可以通过job_id查询爬取结果或取消爬取任务这通常对于一个大型爬取任务来说非常有用。
form: form
############## Crawl Options #######################
- name: includes
type: string
required: false
label:
en_US: URL patterns to include
zh_Hans: 要包含的URL模式
placeholder:
en_US: Use commas to separate multiple tags
zh_Hans: 多个标签时使用半角逗号分隔
human_description:
en_US: Specify URL patterns to include during the crawl. Only pages matching these patterns will be crawled, you can use ',' to separate multiple patterns.
zh_Hans: 指定爬取过程中要包含的URL模式。只有与这些模式匹配的页面才会被爬取。
en_US: |
Only pages matching these patterns will be crawled. Example: blog/*, about/*
zh_Hans: 只有与这些模式匹配的页面才会被爬取。示例blog/*, about/*
form: form
default: ''
- name: excludes
type: string
required: false
label:
en_US: URL patterns to exclude
zh_Hans: 要排除的URL模式
placeholder:
en_US: Use commas to separate multiple tags
zh_Hans: 多个标签时使用半角逗号分隔
human_description:
en_US: Specify URL patterns to exclude during the crawl. Pages matching these patterns will be skipped, you can use ',' to separate multiple patterns.
zh_Hans: 指定爬取过程中要排除的URL模式。匹配这些模式的页面将被跳过。
en_US: |
Pages matching these patterns will be skipped. Example: blog/*, about/*
zh_Hans: 匹配这些模式的页面将被跳过。示例blog/*, about/*
form: form
- name: returnOnlyUrls
type: boolean
default: false
label:
en_US: return Only Urls
zh_Hans: 仅返回URL
human_description:
en_US: |
If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.
zh_Hans: 只返回爬取到的网页链接,而不是网页内容本身。
form: form
- name: maxDepth
type: number
label:
en_US: Maximum crawl depth
zh_Hans: 爬取深度
human_description:
en_US: Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern.
zh_Hans: 相对于输入的URL爬取的最大深度。maxDepth为0时仅抓取输入的URL。maxDepth为1时抓取输入的URL以及所有一级深层页面。maxDepth为2时抓取输入的URL以及所有两级深层页面。更高值遵循相同模式。
form: form
min: 0
- name: mode
type: select
required: false
form: form
options:
- value: default
label:
en_US: default
- value: fast
label:
en_US: fast
default: default
label:
en_US: Crawl Mode
zh_Hans: 爬取模式
human_description:
en_US: The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.
zh_Hans: 使用fast模式将不会使用其站点地图比普通模式快4倍但是可能不够准确也不适用于大量js渲染的网站。
- name: ignoreSitemap
type: boolean
default: false
label:
en_US: ignore Sitemap
zh_Hans: 忽略站点地图
human_description:
en_US: Ignore the website sitemap when crawling.
zh_Hans: 爬取时忽略网站站点地图。
form: form
default: 'blog/*'
- name: limit
type: number
required: false
label:
en_US: Maximum number of pages to crawl
en_US: Maximum pages to crawl
zh_Hans: 最大爬取页面数
human_description:
en_US: Specify the maximum number of pages to crawl. The crawler will stop after reaching this limit.
zh_Hans: 指定要爬取的最大页面数。爬虫将在达到此限制后停止。
form: form
min: 1
max: 20
default: 5
- name: allowBackwardCrawling
type: boolean
default: false
label:
en_US: allow Backward Crawling
zh_Hans: 允许向后爬取
human_description:
en_US: Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'
zh_Hans: 使爬虫能够从特定URL导航到之前链接的页面。例如从'example.com/product/123'返回到'example.com/product'
form: form
- name: allowExternalContentLinks
type: boolean
default: false
label:
en_US: allow External Content Links
zh_Hans: 允许爬取外链
human_description:
en_US: Allows the crawler to follow links to external websites.
zh_Hans:
form: form
############## Page Options #######################
- name: headers
type: string
label:
en_US: headers
zh_Hans: 请求头
human_description:
en_US: |
Headers to send with the request. Can be used to send cookies, user-agent, etc. Example: {"cookies": "testcookies"}
zh_Hans: |
随请求发送的头部。可以用来发送cookies、用户代理等。示例{"cookies": "testcookies"}
placeholder:
en_US: Please enter an object that can be serialized in JSON
zh_Hans: 请输入可以json序列化的对象
form: form
- name: includeHtml
type: boolean
default: false
label:
en_US: include Html
zh_Hans: 包含HTML
human_description:
en_US: Include the HTML version of the content on page. Will output a html key in the response.
zh_Hans: 返回中包含一个HTML版本的内容将以html键返回。
form: form
- name: includeRawHtml
type: boolean
default: false
label:
en_US: include Raw Html
zh_Hans: 包含原始HTML
human_description:
en_US: Include the raw HTML content of the page. Will output a rawHtml key in the response.
zh_Hans: 返回中包含一个原始HTML版本的内容将以rawHtml键返回。
form: form
- name: onlyIncludeTags
type: string
label:
en_US: only Include Tags
zh_Hans: 仅抓取这些标签
placeholder:
en_US: Use commas to separate multiple tags
zh_Hans: 多个标签时使用半角逗号分隔
human_description:
en_US: |
Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: script, .ad, #footer
zh_Hans: |
仅在最终输出中包含HTML页面的这些标签可以通过标签名、类或ID来设定使用逗号分隔值。示例script, .ad, #footer
form: form
- name: onlyMainContent
type: boolean
required: false
default: false
label:
en_US: Only return the main content of the page
zh_Hans: 仅返回页面的主要内容
en_US: only Main Content
zh_Hans: 抓取主要内容
human_description:
en_US: If enabled, the crawler will only return the main content of the page, excluding headers, navigation, footers, etc.
zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
en_US: Only return the main content of the page excluding headers, navs, footers, etc.
zh_Hans: 只返回页面的主要内容,不包括头部、导航栏、尾部等。
form: form
- name: removeTags
type: string
label:
en_US: remove Tags
zh_Hans: 要移除这些标签
human_description:
en_US: |
Tags, classes and ids to remove from the page. Use comma separated values. Example: script, .ad, #footer
zh_Hans: |
要在最终输出中移除HTML页面的这些标签可以通过标签名、类或ID来设定使用逗号分隔值。示例script, .ad, #footer
placeholder:
en_US: Use commas to separate multiple tags
zh_Hans: 多个标签时使用半角逗号分隔
form: form
- name: replaceAllPathsWithAbsolutePaths
type: boolean
default: false
label:
en_US: All AbsolutePaths
zh_Hans: 使用绝对路径
human_description:
en_US: Replace all relative paths with absolute paths for images and links.
zh_Hans: 将所有图片和链接的相对路径替换为绝对路径。
form: form
- name: screenshot
type: boolean
default: false
label:
en_US: screenshot
zh_Hans: 截图
human_description:
en_US: Include a screenshot of the top of the page that you are scraping.
zh_Hans: 提供正在抓取的页面的顶部的截图。
form: form
- name: waitFor
type: number
min: 0
label:
en_US: wait For
zh_Hans: 等待时间
human_description:
en_US: Wait x amount of milliseconds for the page to load to fetch content.
zh_Hans: 等待x毫秒以使页面加载并获取内容。
form: form
options:
- value: 'true'
label:
en_US: 'Yes'
zh_Hans:
- value: 'false'
label:
en_US: 'No'
zh_Hans:
default: 'false'

View File

@ -0,0 +1,20 @@
from typing import Any
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
from core.tools.tool.builtin_tool import BuiltinTool
class CrawlJobTool(BuiltinTool):
    """Tool that checks the status of, or cancels, a Firecrawl crawl job."""

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
        """Run the requested operation on a crawl job and wrap the reply.

        Reads ``operation`` (defaulting to ``'get'``) and ``job_id`` from
        ``tool_parameters``: ``'get'`` calls ``check_crawl_status`` and
        ``'cancel'`` calls ``cancel_crawl_job`` on a ``FirecrawlApp`` built
        from the runtime credentials.

        Returns:
            The result of ``self.create_json_message`` applied to the reply.

        Raises:
            ValueError: If ``operation`` is neither ``'get'`` nor ``'cancel'``.
        """
        credentials = self.runtime.credentials
        client = FirecrawlApp(api_key=credentials['firecrawl_api_key'],
                              base_url=credentials['base_url'])

        action = tool_parameters.get('operation', 'get')
        if action == 'cancel':
            outcome = client.cancel_crawl_job(job_id=tool_parameters['job_id'])
        elif action == 'get':
            outcome = client.check_crawl_status(job_id=tool_parameters['job_id'])
        else:
            raise ValueError(f'Invalid operation: {action}')

        return self.create_json_message(outcome)

View File

@ -0,0 +1,37 @@
identity:
name: crawl_job
author: hjlarry
label:
en_US: Crawl Job
zh_Hans: 爬取任务处理
description:
human:
en_US: Retrieve the scraping results based on the job ID, or cancel the scraping task.
zh_Hans: 根据爬取任务ID获取爬取结果或者取消爬取任务
llm: Retrieve the scraping results based on the job ID, or cancel the scraping task.
parameters:
- name: job_id
type: string
required: true
label:
en_US: Job ID
human_description:
en_US: To obtain a job ID, set wait_for_results to false in the Crawl tool.
zh_Hans: 在深度爬取工具中将等待爬取结果设置为否可以获取Job ID。
llm_description: To obtain a job ID, set wait_for_results to false in the Crawl tool.
form: llm
- name: operation
type: select
required: true
options:
- value: get
label:
en_US: get crawl status
- value: cancel
label:
en_US: cancel crawl job
label:
en_US: operation
zh_Hans: 操作
llm_description: choose the operation to perform. `get` is for getting the crawl status, `cancel` is for cancelling the crawl job.
form: llm

View File

@ -1,26 +1,39 @@
import json
from typing import Any, Union
from typing import Any
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp, get_array_params, get_json_params
from core.tools.tool.builtin_tool import BuiltinTool
class ScrapeTool(BuiltinTool):
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
crawl_result = app.scrape_url(
url=tool_parameters['url'],
wait=True
)
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
"""
the pageOptions and extractorOptions comes from doc here:
https://docs.firecrawl.dev/api-reference/endpoint/scrape
"""
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
base_url=self.runtime.credentials['base_url'])
if isinstance(crawl_result, dict):
result_message = json.dumps(crawl_result, ensure_ascii=False, indent=4)
else:
result_message = str(crawl_result)
pageOptions = {}
extractorOptions = {}
if not crawl_result:
return self.create_text_message("Scrape request failed.")
pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
return self.create_text_message(result_message)
extractorOptions['mode'] = tool_parameters.get('mode', '')
extractorOptions['extractionPrompt'] = tool_parameters.get('extractionPrompt', '')
extractorOptions['extractionSchema'] = get_json_params(tool_parameters, 'extractionSchema')
crawl_result = app.scrape_url(url=tool_parameters['url'],
pageOptions=pageOptions,
extractorOptions=extractorOptions)
return self.create_json_message(crawl_result)

View File

@ -3,7 +3,7 @@ identity:
author: ahasasjeb
label:
en_US: Scrape
zh_Hans: 抓取
zh_Hans: 单页面抓取
description:
human:
en_US: Extract data from a single URL.
@ -21,3 +21,160 @@ parameters:
zh_Hans: 要抓取并提取数据的网站URL。
llm_description: The URL of the website that needs to be crawled. This is a required parameter.
form: llm
############## Page Options #######################
- name: headers
type: string
label:
en_US: headers
zh_Hans: 请求头
human_description:
en_US: |
Headers to send with the request. Can be used to send cookies, user-agent, etc. Example: {"cookies": "testcookies"}
zh_Hans: |
随请求发送的头部。可以用来发送cookies、用户代理等。示例{"cookies": "testcookies"}
placeholder:
en_US: Please enter an object that can be serialized in JSON
zh_Hans: 请输入可以json序列化的对象
form: form
- name: includeHtml
type: boolean
default: false
label:
en_US: include Html
zh_Hans: 包含HTML
human_description:
en_US: Include the HTML version of the content on page. Will output a html key in the response.
zh_Hans: 返回中包含一个HTML版本的内容将以html键返回。
form: form
- name: includeRawHtml
type: boolean
default: false
label:
en_US: include Raw Html
zh_Hans: 包含原始HTML
human_description:
en_US: Include the raw HTML content of the page. Will output a rawHtml key in the response.
zh_Hans: 返回中包含一个原始HTML版本的内容将以rawHtml键返回。
form: form
- name: onlyIncludeTags
type: string
label:
en_US: only Include Tags
zh_Hans: 仅抓取这些标签
placeholder:
en_US: Use commas to separate multiple tags
zh_Hans: 多个标签时使用半角逗号分隔
human_description:
en_US: |
Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: script, .ad, #footer
zh_Hans: |
仅在最终输出中包含HTML页面的这些标签可以通过标签名、类或ID来设定使用逗号分隔值。示例script, .ad, #footer
form: form
- name: onlyMainContent
type: boolean
default: false
label:
en_US: only Main Content
zh_Hans: 仅抓取主要内容
human_description:
en_US: Only return the main content of the page excluding headers, navs, footers, etc.
zh_Hans: 只返回页面的主要内容,不包括头部、导航栏、尾部等。
form: form
- name: removeTags
type: string
label:
en_US: remove Tags
zh_Hans: 要移除这些标签
human_description:
en_US: |
Tags, classes and ids to remove from the page. Use comma separated values. Example: script, .ad, #footer
zh_Hans: |
要在最终输出中移除HTML页面的这些标签可以通过标签名、类或ID来设定使用逗号分隔值。示例script, .ad, #footer
placeholder:
en_US: Use commas to separate multiple tags
zh_Hans: 多个标签时使用半角逗号分隔
form: form
- name: replaceAllPathsWithAbsolutePaths
type: boolean
default: false
label:
en_US: All AbsolutePaths
zh_Hans: 使用绝对路径
human_description:
en_US: Replace all relative paths with absolute paths for images and links.
zh_Hans: 将所有图片和链接的相对路径替换为绝对路径。
form: form
- name: screenshot
type: boolean
default: false
label:
en_US: screenshot
zh_Hans: 截图
human_description:
en_US: Include a screenshot of the top of the page that you are scraping.
zh_Hans: 提供正在抓取的页面的顶部的截图。
form: form
- name: waitFor
type: number
min: 0
label:
en_US: wait For
zh_Hans: 等待时间
human_description:
en_US: Wait x amount of milliseconds for the page to load to fetch content.
zh_Hans: 等待x毫秒以使页面加载并获取内容。
form: form
############## Extractor Options #######################
- name: mode
type: select
options:
- value: markdown
label:
en_US: markdown
- value: llm-extraction
label:
en_US: llm-extraction
- value: llm-extraction-from-raw-html
label:
en_US: llm-extraction-from-raw-html
- value: llm-extraction-from-markdown
label:
en_US: llm-extraction-from-markdown
label:
en_US: Extractor Mode
zh_Hans: 提取模式
human_description:
en_US: |
The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM.
zh_Hans: 使用的提取模式。“markdown”返回抓取的markdown内容不执行LLM提取。“llm-extraction”使用LLM按Extractor Schema从内容中提取信息。
form: form
- name: extractionPrompt
type: string
label:
en_US: Extractor Prompt
zh_Hans: 提取时的提示词
human_description:
en_US: A prompt describing what information to extract from the page, applicable for LLM extraction modes.
zh_Hans: 当使用LLM提取模式时用于给LLM描述提取规则。
form: form
- name: extractionSchema
type: string
label:
en_US: Extractor Schema
zh_Hans: 提取时的结构
placeholder:
en_US: Please enter an object that can be serialized in JSON
human_description:
en_US: |
The schema for the data to be extracted, required only for LLM extraction modes. Example: {
"type": "object",
"properties": {"company_mission": {"type": "string"}},
"required": ["company_mission"]
}
zh_Hans: |
当使用LLM提取模式时使用该结构去提取示例{
"type": "object",
"properties": {"company_mission": {"type": "string"}},
"required": ["company_mission"]
}
form: form

View File

@ -1,5 +1,4 @@
import json
from typing import Any, Union
from typing import Any
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
@ -7,20 +6,23 @@ from core.tools.tool.builtin_tool import BuiltinTool
class SearchTool(BuiltinTool):
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
crawl_result = app.search(
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
"""
the pageOptions and searchOptions comes from doc here:
https://docs.firecrawl.dev/api-reference/endpoint/search
"""
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
base_url=self.runtime.credentials['base_url'])
pageOptions = {}
pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
pageOptions['fetchPageContent'] = tool_parameters.get('fetchPageContent', True)
pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
searchOptions = {'limit': tool_parameters.get('limit')}
search_result = app.search(
query=tool_parameters['keyword'],
wait=True
pageOptions=pageOptions,
searchOptions=searchOptions
)
if isinstance(crawl_result, dict):
result_message = json.dumps(crawl_result, ensure_ascii=False, indent=4)
else:
result_message = str(crawl_result)
if not crawl_result:
return self.create_text_message("Search request failed.")
return self.create_text_message(result_message)
return self.create_json_message(search_result)

View File

@ -21,3 +21,55 @@ parameters:
zh_Hans: 输入关键词即可使用Firecrawl API进行搜索。
llm_description: Efficiently extract keywords from user text.
form: llm
############## Page Options #######################
- name: onlyMainContent
type: boolean
default: false
label:
en_US: only Main Content
zh_Hans: 仅抓取主要内容
human_description:
en_US: Only return the main content of the page excluding headers, navs, footers, etc.
zh_Hans: 只返回页面的主要内容,不包括头部、导航栏、尾部等。
form: form
- name: fetchPageContent
type: boolean
default: true
label:
en_US: fetch Page Content
zh_Hans: 抓取页面内容
human_description:
en_US: Fetch the content of each page. If false, defaults to a basic fast serp API.
zh_Hans: 获取每个页面的内容。如果为否则使用基本的快速搜索结果页面API。
form: form
- name: includeHtml
type: boolean
default: false
label:
en_US: include Html
zh_Hans: 包含HTML
human_description:
en_US: Include the HTML version of the content on page. Will output a html key in the response.
zh_Hans: 返回中包含一个HTML版本的内容将以html键返回。
form: form
- name: includeRawHtml
type: boolean
default: false
label:
en_US: include Raw Html
zh_Hans: 包含原始HTML
human_description:
en_US: Include the raw HTML content of the page. Will output a rawHtml key in the response.
zh_Hans: 返回中包含一个原始HTML版本的内容将以rawHtml键返回。
form: form
############## Search Options #######################
- name: limit
type: number
min: 0
label:
en_US: Maximum results
zh_Hans: 最大结果数量
human_description:
en_US: Maximum number of results. Max is 20 during beta.
zh_Hans: 最大结果数量。在测试阶段最大为20。
form: form