chore: update firecrawl scrape to V1 api (#8367)

非法操作 2024-09-13 20:02:00 +08:00 committed by GitHub
parent cd3eaed335
commit 06b66216d7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 287 additions and 378 deletions

View File

@@ -37,9 +37,8 @@ class FirecrawlApp:
         for i in range(retries):
             try:
                 response = requests.request(method, url, json=data, headers=headers)
-                response.raise_for_status()
                 return response.json()
-            except requests.exceptions.RequestException as e:
+            except requests.exceptions.RequestException:
                 if i < retries - 1:
                     time.sleep(backoff_factor * (2**i))
                 else:
@@ -47,7 +46,7 @@ class FirecrawlApp:
         return None

     def scrape_url(self, url: str, **kwargs):
-        endpoint = f"{self.base_url}/v0/scrape"
+        endpoint = f"{self.base_url}/v1/scrape"
         data = {"url": url, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
         response = self._request("POST", endpoint, data)
@@ -55,39 +54,41 @@ class FirecrawlApp:
             raise HTTPError("Failed to scrape URL after multiple retries")
         return response

-    def search(self, query: str, **kwargs):
-        endpoint = f"{self.base_url}/v0/search"
-        data = {"query": query, **kwargs}
+    def map(self, url: str, **kwargs):
+        endpoint = f"{self.base_url}/v1/map"
+        data = {"url": url, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
         response = self._request("POST", endpoint, data)
         if response is None:
-            raise HTTPError("Failed to perform search after multiple retries")
+            raise HTTPError("Failed to perform map after multiple retries")
         return response

     def crawl_url(
         self, url: str, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
     ):
-        endpoint = f"{self.base_url}/v0/crawl"
+        endpoint = f"{self.base_url}/v1/crawl"
         headers = self._prepare_headers(idempotency_key)
         data = {"url": url, **kwargs}
         logger.debug(f"Sent request to {endpoint=} body={data}")
         response = self._request("POST", endpoint, data, headers)
         if response is None:
             raise HTTPError("Failed to initiate crawl after multiple retries")
-        job_id: str = response["jobId"]
+        elif response.get("success") == False:
+            raise HTTPError(f'Failed to crawl: {response.get("error")}')
+        job_id: str = response["id"]
         if wait:
             return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
         return response

     def check_crawl_status(self, job_id: str):
-        endpoint = f"{self.base_url}/v0/crawl/status/{job_id}"
+        endpoint = f"{self.base_url}/v1/crawl/{job_id}"
         response = self._request("GET", endpoint)
         if response is None:
             raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
         return response

     def cancel_crawl_job(self, job_id: str):
-        endpoint = f"{self.base_url}/v0/crawl/cancel/{job_id}"
+        endpoint = f"{self.base_url}/v1/crawl/{job_id}"
         response = self._request("DELETE", endpoint)
         if response is None:
             raise HTTPError(f"Failed to cancel job {job_id} after multiple retries")
@@ -116,6 +117,6 @@ def get_json_params(tool_parameters: dict[str, Any], key):
             # support both single quotes and double quotes
             param = param.replace("'", '"')
             param = json.loads(param)
-        except:
+        except Exception:
             raise ValueError(f"Invalid {key} format.")
     return param
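For reference, a minimal standalone sketch of the two idioms in the helper above: the exponential backoff schedule in _request, and the quote normalization in get_json_params. The retries and backoff_factor values here are hypothetical, not the module's defaults.

import json

# Hypothetical settings; firecrawl_appx defines its own defaults.
retries = 3
backoff_factor = 0.5

# Backoff schedule used by _request: sleep backoff_factor * (2**i)
# after each failed attempt except the last.
for i in range(retries - 1):
    print(f"attempt {i + 1} failed -> sleep {backoff_factor * (2**i)}s")
# attempt 1 failed -> sleep 0.5s
# attempt 2 failed -> sleep 1.0s

# Quote normalization in get_json_params: single-quoted pseudo-JSON
# from a form field is rewritten before json.loads.
param = "{'Authorization': 'Bearer fc-xxx'}"
print(json.loads(param.replace("'", '"')))
# {'Authorization': 'Bearer fc-xxx'}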

View File

@@ -8,39 +8,38 @@ from core.tools.tool.builtin_tool import BuiltinTool
 class CrawlTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
         """
-        the crawlerOptions and pageOptions comes from doc here:
+        the api doc:
         https://docs.firecrawl.dev/api-reference/endpoint/crawl
         """
         app = FirecrawlApp(
             api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
         )
-        crawlerOptions = {}
-        pageOptions = {}
+        scrapeOptions = {}
+        payload = {}
         wait_for_results = tool_parameters.get("wait_for_results", True)
-        crawlerOptions["excludes"] = get_array_params(tool_parameters, "excludes")
-        crawlerOptions["includes"] = get_array_params(tool_parameters, "includes")
-        crawlerOptions["returnOnlyUrls"] = tool_parameters.get("returnOnlyUrls", False)
-        crawlerOptions["maxDepth"] = tool_parameters.get("maxDepth")
-        crawlerOptions["mode"] = tool_parameters.get("mode")
-        crawlerOptions["ignoreSitemap"] = tool_parameters.get("ignoreSitemap", False)
-        crawlerOptions["limit"] = tool_parameters.get("limit", 5)
-        crawlerOptions["allowBackwardCrawling"] = tool_parameters.get("allowBackwardCrawling", False)
-        crawlerOptions["allowExternalContentLinks"] = tool_parameters.get("allowExternalContentLinks", False)
-        pageOptions["headers"] = get_json_params(tool_parameters, "headers")
-        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
-        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
-        pageOptions["onlyIncludeTags"] = get_array_params(tool_parameters, "onlyIncludeTags")
-        pageOptions["removeTags"] = get_array_params(tool_parameters, "removeTags")
-        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
-        pageOptions["replaceAllPathsWithAbsolutePaths"] = tool_parameters.get("replaceAllPathsWithAbsolutePaths", False)
-        pageOptions["screenshot"] = tool_parameters.get("screenshot", False)
-        pageOptions["waitFor"] = tool_parameters.get("waitFor", 0)
-        crawl_result = app.crawl_url(
-            url=tool_parameters["url"], wait=wait_for_results, crawlerOptions=crawlerOptions, pageOptions=pageOptions
-        )
+        payload["excludePaths"] = get_array_params(tool_parameters, "excludePaths")
+        payload["includePaths"] = get_array_params(tool_parameters, "includePaths")
+        payload["maxDepth"] = tool_parameters.get("maxDepth")
+        payload["ignoreSitemap"] = tool_parameters.get("ignoreSitemap", False)
+        payload["limit"] = tool_parameters.get("limit", 5)
+        payload["allowBackwardLinks"] = tool_parameters.get("allowBackwardLinks", False)
+        payload["allowExternalLinks"] = tool_parameters.get("allowExternalLinks", False)
+        payload["webhook"] = tool_parameters.get("webhook")
+        scrapeOptions["formats"] = get_array_params(tool_parameters, "formats")
+        scrapeOptions["headers"] = get_json_params(tool_parameters, "headers")
+        scrapeOptions["includeTags"] = get_array_params(tool_parameters, "includeTags")
+        scrapeOptions["excludeTags"] = get_array_params(tool_parameters, "excludeTags")
+        scrapeOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
+        scrapeOptions["waitFor"] = tool_parameters.get("waitFor", 0)
+        scrapeOptions = {k: v for k, v in scrapeOptions.items() if v not in (None, "")}
+        payload["scrapeOptions"] = scrapeOptions or None
+        payload = {k: v for k, v in payload.items() if v not in (None, "")}
+        crawl_result = app.crawl_url(url=tool_parameters["url"], wait=wait_for_results, **payload)
         return self.create_json_message(crawl_result)
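The dict comprehension at the end carries the v1 payload convention: unset form fields arrive as None or empty strings and are stripped before crawl_url is called, while meaningful falsy values such as False survive. A minimal sketch of the pruning idiom:

# Sketch of the pruning idiom above (illustrative values).
payload = {
    "excludePaths": None,    # unset form field -> dropped
    "webhook": "",           # unset form field -> dropped
    "maxDepth": 2,
    "ignoreSitemap": False,  # falsy but meaningful -> kept
}
payload = {k: v for k, v in payload.items() if v not in (None, "")}
print(payload)  # {'maxDepth': 2, 'ignoreSitemap': False}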

View File

@@ -31,8 +31,21 @@ parameters:
       en_US: If you choose not to wait, it will directly return a job ID. You can use this job ID to check the crawling results or cancel the crawling task, which is usually very useful for a large-scale crawling task.
       zh_Hans: 如果选择不等待，则会直接返回一个job_id，可以通过job_id查询爬取结果或取消爬取任务，这通常对于一个大型爬取任务来说非常有用。
     form: form
-############## Crawl Options #######################
-  - name: includes
+############## Payload #######################
+  - name: excludePaths
+    type: string
+    label:
+      en_US: URL patterns to exclude
+      zh_Hans: 要排除的URL模式
+    placeholder:
+      en_US: Use commas to separate multiple tags
+      zh_Hans: 多个标签时使用半角逗号分隔
+    human_description:
+      en_US: |
+        Pages matching these patterns will be skipped. Example: blog/*, about/*
+      zh_Hans: 匹配这些模式的页面将被跳过。示例：blog/*, about/*
+    form: form
+  - name: includePaths
     type: string
     required: false
     label:
@@ -46,30 +59,6 @@ parameters:
         Only pages matching these patterns will be crawled. Example: blog/*, about/*
       zh_Hans: 只有与这些模式匹配的页面才会被爬取。示例：blog/*, about/*
     form: form
-  - name: excludes
-    type: string
-    label:
-      en_US: URL patterns to exclude
-      zh_Hans: 要排除的URL模式
-    placeholder:
-      en_US: Use commas to separate multiple tags
-      zh_Hans: 多个标签时使用半角逗号分隔
-    human_description:
-      en_US: |
-        Pages matching these patterns will be skipped. Example: blog/*, about/*
-      zh_Hans: 匹配这些模式的页面将被跳过。示例：blog/*, about/*
-    form: form
-  - name: returnOnlyUrls
-    type: boolean
-    default: false
-    label:
-      en_US: return Only Urls
-      zh_Hans: 仅返回URL
-    human_description:
-      en_US: |
-        If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.
-      zh_Hans: 只返回爬取到的网页链接，而不是网页内容本身。
-    form: form
   - name: maxDepth
     type: number
     label:
@@ -80,27 +69,10 @@ parameters:
       zh_Hans: 相对于输入的URL爬取的最大深度。maxDepth为0时，仅抓取输入的URL。maxDepth为1时，抓取输入的URL以及所有一级深层页面。maxDepth为2时，抓取输入的URL以及所有两级深层页面。更高值遵循相同模式。
     form: form
     min: 0
-  - name: mode
-    type: select
-    required: false
-    form: form
-    options:
-      - value: default
-        label:
-          en_US: default
-      - value: fast
-        label:
-          en_US: fast
-    default: default
-    label:
-      en_US: Crawl Mode
-      zh_Hans: 爬取模式
-    human_description:
-      en_US: The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.
-      zh_Hans: 使用fast模式将不会使用其站点地图，比普通模式快4倍，但是可能不够准确，也不适用于大量js渲染的网站。
+    default: 2
   - name: ignoreSitemap
     type: boolean
-    default: false
+    default: true
     label:
       en_US: ignore Sitemap
       zh_Hans: 忽略站点地图
@@ -120,7 +92,7 @@ parameters:
     form: form
     min: 1
     default: 5
-  - name: allowBackwardCrawling
+  - name: allowBackwardLinks
     type: boolean
     default: false
     label:
@@ -130,7 +102,7 @@ parameters:
       en_US: Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'
       zh_Hans: 使爬虫能够从特定URL导航到之前链接的页面。例如，从'example.com/product/123'返回到'example.com/product'
     form: form
-  - name: allowExternalContentLinks
+  - name: allowExternalLinks
     type: boolean
     default: false
     label:
@@ -140,7 +112,30 @@ parameters:
       en_US: Allows the crawler to follow links to external websites.
       zh_Hans:
     form: form
-############## Page Options #######################
+  - name: webhook
+    type: string
+    label:
+      en_US: webhook
+    human_description:
+      en_US: |
+        The URL to send the webhook to. This will trigger for crawl started (crawl.started), every page crawled (crawl.page) and when the crawl is completed (crawl.completed or crawl.failed). The response will be the same as the /scrape endpoint.
+      zh_Hans: 发送Webhook的URL。这将在开始爬取（crawl.started）、每爬取一个页面（crawl.page）以及爬取完成（crawl.completed或crawl.failed）时触发。响应将与/scrape端点相同。
+    form: form
+############## Scrape Options #######################
+  - name: formats
+    type: string
+    label:
+      en_US: Formats
+      zh_Hans: 结果的格式
+    placeholder:
+      en_US: Use commas to separate multiple tags
+      zh_Hans: 多个标签时使用半角逗号分隔
+    human_description:
+      en_US: |
+        Formats to include in the output. Available options: markdown, html, rawHtml, links, screenshot
+      zh_Hans: |
+        输出中应包含的格式。可以填入: markdown, html, rawHtml, links, screenshot
+    form: form
   - name: headers
     type: string
     label:
@@ -155,30 +150,10 @@ parameters:
       en_US: Please enter an object that can be serialized in JSON
       zh_Hans: 请输入可以json序列化的对象
     form: form
-  - name: includeHtml
-    type: boolean
-    default: false
-    label:
-      en_US: include Html
-      zh_Hans: 包含HTML
-    human_description:
-      en_US: Include the HTML version of the content on page. Will output a html key in the response.
-      zh_Hans: 返回中包含一个HTML版本的内容，将以html键返回。
-    form: form
-  - name: includeRawHtml
-    type: boolean
-    default: false
-    label:
-      en_US: include Raw Html
-      zh_Hans: 包含原始HTML
-    human_description:
-      en_US: Include the raw HTML content of the page. Will output a rawHtml key in the response.
-      zh_Hans: 返回中包含一个原始HTML版本的内容，将以rawHtml键返回。
-    form: form
-  - name: onlyIncludeTags
+  - name: includeTags
     type: string
     label:
-      en_US: only Include Tags
+      en_US: Include Tags
       zh_Hans: 仅抓取这些标签
     placeholder:
       en_US: Use commas to separate multiple tags
@@ -189,6 +164,20 @@ parameters:
       zh_Hans: |
         仅在最终输出中包含HTML页面的这些标签，可以通过标签名、类或ID来设定，使用逗号分隔值。示例：script, .ad, #footer
     form: form
+  - name: excludeTags
+    type: string
+    label:
+      en_US: Exclude Tags
+      zh_Hans: 要移除这些标签
+    human_description:
+      en_US: |
+        Tags, classes and ids to remove from the page. Use comma separated values. Example: script, .ad, #footer
+      zh_Hans: |
+        要在最终输出中移除HTML页面的这些标签，可以通过标签名、类或ID来设定，使用逗号分隔值。示例：script, .ad, #footer
+    placeholder:
+      en_US: Use commas to separate multiple tags
+      zh_Hans: 多个标签时使用半角逗号分隔
+    form: form
   - name: onlyMainContent
     type: boolean
     default: false
@@ -199,40 +188,6 @@ parameters:
       en_US: Only return the main content of the page excluding headers, navs, footers, etc.
       zh_Hans: 只返回页面的主要内容，不包括头部、导航栏、尾部等。
     form: form
-  - name: removeTags
-    type: string
-    label:
-      en_US: remove Tags
-      zh_Hans: 要移除这些标签
-    human_description:
-      en_US: |
-        Tags, classes and ids to remove from the page. Use comma separated values. Example: script, .ad, #footer
-      zh_Hans: |
-        要在最终输出中移除HTML页面的这些标签，可以通过标签名、类或ID来设定，使用逗号分隔值。示例：script, .ad, #footer
-    placeholder:
-      en_US: Use commas to separate multiple tags
-      zh_Hans: 多个标签时使用半角逗号分隔
-    form: form
-  - name: replaceAllPathsWithAbsolutePaths
-    type: boolean
-    default: false
-    label:
-      en_US: All AbsolutePaths
-      zh_Hans: 使用绝对路径
-    human_description:
-      en_US: Replace all relative paths with absolute paths for images and links.
-      zh_Hans: 将所有图片和链接的相对路径替换为绝对路径。
-    form: form
-  - name: screenshot
-    type: boolean
-    default: false
-    label:
-      en_US: screenshot
-      zh_Hans: 截图
-    human_description:
-      en_US: Include a screenshot of the top of the page that you are scraping.
-      zh_Hans: 提供正在抓取的页面的顶部的截图。
-    form: form
   - name: waitFor
     type: number
     min: 0
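Assembled from the form fields above, the body the tool POSTs to /v1/crawl ends up looking roughly like this (illustrative values, shown as the Python dict the tool builds):

# Illustrative body for POST /v1/crawl, assembled from the form
# fields above; values are examples, not API defaults.
crawl_body = {
    "url": "https://example.com",
    "includePaths": ["blog/*"],
    "excludePaths": ["about/*"],
    "maxDepth": 2,
    "ignoreSitemap": True,
    "limit": 5,
    "allowBackwardLinks": False,
    "allowExternalLinks": False,
    "scrapeOptions": {"formats": ["markdown"], "onlyMainContent": True},
}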

View File

@@ -0,0 +1,25 @@
from typing import Any

from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
from core.tools.tool.builtin_tool import BuiltinTool


class MapTool(BuiltinTool):
    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
        """
        the api doc:
        https://docs.firecrawl.dev/api-reference/endpoint/map
        """
        app = FirecrawlApp(
            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
        )
        payload = {}
        payload["search"] = tool_parameters.get("search")
        payload["ignoreSitemap"] = tool_parameters.get("ignoreSitemap", True)
        payload["includeSubdomains"] = tool_parameters.get("includeSubdomains", False)
        payload["limit"] = tool_parameters.get("limit", 5000)

        map_result = app.map(url=tool_parameters["url"], **payload)

        return self.create_json_message(map_result)
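A hypothetical direct call against the wrapper, with placeholder credentials. Note that, unlike the crawl and scrape tools, MapTool does not prune unset keys, so a missing search parameter is serialized as JSON null:

# Hypothetical direct use of the wrapper; credentials are placeholders.
app = FirecrawlApp(api_key="fc-...", base_url="https://api.firecrawl.dev")
result = app.map(
    url="https://example.com",
    search=None,  # unset form field; sent as null since map.py does not prune
    ignoreSitemap=True,
    includeSubdomains=False,
    limit=5000,
)
print(result)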

View File

@@ -0,0 +1,59 @@
identity:
  name: map
  author: hjlarry
  label:
    en_US: Map
    zh_Hans: 地图式快爬
description:
  human:
    en_US: Input a website and get all the urls on the website - extremely fast
    zh_Hans: 输入一个网站，快速获取网站上的所有网址。
  llm: Input a website and get all the urls on the website - extremely fast
parameters:
  - name: url
    type: string
    required: true
    label:
      en_US: Start URL
      zh_Hans: 起始URL
    human_description:
      en_US: The base URL to start crawling from.
      zh_Hans: 要爬取网站的起始URL。
    llm_description: The URL of the website that needs to be crawled. This is a required parameter.
    form: llm
  - name: search
    type: string
    label:
      en_US: search
      zh_Hans: 搜索查询
    human_description:
      en_US: Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 100 search results. However, if map finds more results, there is no limit applied.
      zh_Hans: 用于映射的搜索查询。在Alpha阶段，搜索功能的“智能”部分限制为最多100个搜索结果。然而，如果地图找到了更多结果，则不施加任何限制。
    llm_description: Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 100 search results. However, if map finds more results, there is no limit applied.
    form: llm
  ############## Page Options #######################
  - name: ignoreSitemap
    type: boolean
    default: true
    label:
      en_US: ignore Sitemap
      zh_Hans: 忽略站点地图
    human_description:
      en_US: Ignore the website sitemap when crawling.
      zh_Hans: 爬取时忽略网站站点地图。
    form: form
  - name: includeSubdomains
    type: boolean
    default: false
    label:
      en_US: include Subdomains
      zh_Hans: 包含子域名
    form: form
  - name: limit
    type: number
    min: 0
    default: 5000
    label:
      en_US: Maximum results
      zh_Hans: 最大结果数量
    form: form

View File

@@ -6,34 +6,34 @@ from core.tools.tool.builtin_tool import BuiltinTool
 class ScrapeTool(BuiltinTool):
-    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInvokeMessage]:
         """
-        the pageOptions and extractorOptions comes from doc here:
+        the api doc:
         https://docs.firecrawl.dev/api-reference/endpoint/scrape
         """
         app = FirecrawlApp(
             api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
         )
-        pageOptions = {}
-        extractorOptions = {}
-        pageOptions["headers"] = get_json_params(tool_parameters, "headers")
-        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
-        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
-        pageOptions["onlyIncludeTags"] = get_array_params(tool_parameters, "onlyIncludeTags")
-        pageOptions["removeTags"] = get_array_params(tool_parameters, "removeTags")
-        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
-        pageOptions["replaceAllPathsWithAbsolutePaths"] = tool_parameters.get("replaceAllPathsWithAbsolutePaths", False)
-        pageOptions["screenshot"] = tool_parameters.get("screenshot", False)
-        pageOptions["waitFor"] = tool_parameters.get("waitFor", 0)
-        extractorOptions["mode"] = tool_parameters.get("mode", "")
-        extractorOptions["extractionPrompt"] = tool_parameters.get("extractionPrompt", "")
-        extractorOptions["extractionSchema"] = get_json_params(tool_parameters, "extractionSchema")
-        crawl_result = app.scrape_url(
-            url=tool_parameters["url"], pageOptions=pageOptions, extractorOptions=extractorOptions
-        )
-        return self.create_json_message(crawl_result)
+        payload = {}
+        extract = {}
+        payload["formats"] = get_array_params(tool_parameters, "formats")
+        payload["onlyMainContent"] = tool_parameters.get("onlyMainContent", True)
+        payload["includeTags"] = get_array_params(tool_parameters, "includeTags")
+        payload["excludeTags"] = get_array_params(tool_parameters, "excludeTags")
+        payload["headers"] = get_json_params(tool_parameters, "headers")
+        payload["waitFor"] = tool_parameters.get("waitFor", 0)
+        payload["timeout"] = tool_parameters.get("timeout", 30000)
+        extract["schema"] = get_json_params(tool_parameters, "schema")
+        extract["systemPrompt"] = tool_parameters.get("systemPrompt")
+        extract["prompt"] = tool_parameters.get("prompt")
+        extract = {k: v for k, v in extract.items() if v not in (None, "")}
+        payload["extract"] = extract or None
+        payload = {k: v for k, v in payload.items() if v not in (None, "")}
+        crawl_result = app.scrape_url(url=tool_parameters["url"], **payload)
+        markdown_result = crawl_result.get("data", {}).get("markdown", "")
+        return [self.create_text_message(markdown_result), self.create_json_message(crawl_result)]
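The pruning idiom is applied twice here: an extract block whose fields were all left blank collapses to None and is then dropped from the top-level payload, so /v1/scrape never receives an empty extract object. A minimal sketch:

# Sketch of the double pruning above (illustrative values).
extract = {"schema": None, "systemPrompt": None, "prompt": ""}
extract = {k: v for k, v in extract.items() if v not in (None, "")}

payload = {"formats": ["markdown"], "extract": extract or None}
payload = {k: v for k, v in payload.items() if v not in (None, "")}
print(payload)  # {'formats': ['markdown']}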

View File

@@ -6,8 +6,8 @@ identity:
     zh_Hans: 单页面抓取
 description:
   human:
-    en_US: Extract data from a single URL.
-    zh_Hans: 从单个URL抓取数据。
+    en_US: Turn any url into clean data.
+    zh_Hans: 将任何网址转换为干净的数据。
   llm: This tool is designed to scrape URL and output the content in Markdown format.
 parameters:
   - name: url
@@ -21,7 +21,59 @@ parameters:
       zh_Hans: 要抓取并提取数据的网站URL。
     llm_description: The URL of the website that needs to be crawled. This is a required parameter.
     form: llm
-############## Page Options #######################
+############## Payload #######################
+  - name: formats
+    type: string
+    label:
+      en_US: Formats
+      zh_Hans: 结果的格式
+    placeholder:
+      en_US: Use commas to separate multiple tags
+      zh_Hans: 多个标签时使用半角逗号分隔
+    human_description:
+      en_US: |
+        Formats to include in the output. Available options: markdown, html, rawHtml, links, screenshot, extract, screenshot@fullPage
+      zh_Hans: |
+        输出中应包含的格式。可以填入: markdown, html, rawHtml, links, screenshot, extract, screenshot@fullPage
+    form: form
+  - name: onlyMainContent
+    type: boolean
+    default: false
+    label:
+      en_US: only Main Content
+      zh_Hans: 仅抓取主要内容
+    human_description:
+      en_US: Only return the main content of the page excluding headers, navs, footers, etc.
+      zh_Hans: 只返回页面的主要内容，不包括头部、导航栏、尾部等。
+    form: form
+  - name: includeTags
+    type: string
+    label:
+      en_US: Include Tags
+      zh_Hans: 仅抓取这些标签
+    placeholder:
+      en_US: Use commas to separate multiple tags
+      zh_Hans: 多个标签时使用半角逗号分隔
+    human_description:
+      en_US: |
+        Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: script, .ad, #footer
+      zh_Hans: |
+        仅在最终输出中包含HTML页面的这些标签，可以通过标签名、类或ID来设定，使用逗号分隔值。示例：script, .ad, #footer
+    form: form
+  - name: excludeTags
+    type: string
+    label:
+      en_US: Exclude Tags
+      zh_Hans: 要移除这些标签
+    human_description:
+      en_US: |
+        Tags, classes and ids to remove from the page. Use comma separated values. Example: script, .ad, #footer
+      zh_Hans: |
+        要在最终输出中移除HTML页面的这些标签，可以通过标签名、类或ID来设定，使用逗号分隔值。示例：script, .ad, #footer
+    placeholder:
+      en_US: Use commas to separate multiple tags
+      zh_Hans: 多个标签时使用半角逗号分隔
+    form: form
   - name: headers
     type: string
     label:
@@ -36,87 +88,10 @@ parameters:
       en_US: Please enter an object that can be serialized in JSON
       zh_Hans: 请输入可以json序列化的对象
     form: form
-  - name: includeHtml
-    type: boolean
-    default: false
-    label:
-      en_US: include Html
-      zh_Hans: 包含HTML
-    human_description:
-      en_US: Include the HTML version of the content on page. Will output a html key in the response.
-      zh_Hans: 返回中包含一个HTML版本的内容，将以html键返回。
-    form: form
-  - name: includeRawHtml
-    type: boolean
-    default: false
-    label:
-      en_US: include Raw Html
-      zh_Hans: 包含原始HTML
-    human_description:
-      en_US: Include the raw HTML content of the page. Will output a rawHtml key in the response.
-      zh_Hans: 返回中包含一个原始HTML版本的内容，将以rawHtml键返回。
-    form: form
-  - name: onlyIncludeTags
-    type: string
-    label:
-      en_US: only Include Tags
-      zh_Hans: 仅抓取这些标签
-    placeholder:
-      en_US: Use commas to separate multiple tags
-      zh_Hans: 多个标签时使用半角逗号分隔
-    human_description:
-      en_US: |
-        Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: script, .ad, #footer
-      zh_Hans: |
-        仅在最终输出中包含HTML页面的这些标签，可以通过标签名、类或ID来设定，使用逗号分隔值。示例：script, .ad, #footer
-    form: form
-  - name: onlyMainContent
-    type: boolean
-    default: false
-    label:
-      en_US: only Main Content
-      zh_Hans: 仅抓取主要内容
-    human_description:
-      en_US: Only return the main content of the page excluding headers, navs, footers, etc.
-      zh_Hans: 只返回页面的主要内容，不包括头部、导航栏、尾部等。
-    form: form
-  - name: removeTags
-    type: string
-    label:
-      en_US: remove Tags
-      zh_Hans: 要移除这些标签
-    human_description:
-      en_US: |
-        Tags, classes and ids to remove from the page. Use comma separated values. Example: script, .ad, #footer
-      zh_Hans: |
-        要在最终输出中移除HTML页面的这些标签，可以通过标签名、类或ID来设定，使用逗号分隔值。示例：script, .ad, #footer
-    placeholder:
-      en_US: Use commas to separate multiple tags
-      zh_Hans: 多个标签时使用半角逗号分隔
-    form: form
-  - name: replaceAllPathsWithAbsolutePaths
-    type: boolean
-    default: false
-    label:
-      en_US: All AbsolutePaths
-      zh_Hans: 使用绝对路径
-    human_description:
-      en_US: Replace all relative paths with absolute paths for images and links.
-      zh_Hans: 将所有图片和链接的相对路径替换为绝对路径。
-    form: form
-  - name: screenshot
-    type: boolean
-    default: false
-    label:
-      en_US: screenshot
-      zh_Hans: 截图
-    human_description:
-      en_US: Include a screenshot of the top of the page that you are scraping.
-      zh_Hans: 提供正在抓取的页面的顶部的截图。
-    form: form
   - name: waitFor
     type: number
     min: 0
+    default: 0
     label:
       en_US: wait For
       zh_Hans: 等待时间
@@ -124,57 +99,54 @@ parameters:
       en_US: Wait x amount of milliseconds for the page to load to fetch content.
       zh_Hans: 等待x毫秒以使页面加载并获取内容。
     form: form
+  - name: timeout
+    type: number
+    min: 0
+    default: 30000
+    label:
+      en_US: Timeout
+    human_description:
+      en_US: Timeout in milliseconds for the request.
+      zh_Hans: 请求的超时时间（以毫秒为单位）。
+    form: form
 ############## Extractor Options #######################
-  - name: mode
-    type: select
-    options:
-      - value: markdown
-        label:
-          en_US: markdown
-      - value: llm-extraction
-        label:
-          en_US: llm-extraction
-      - value: llm-extraction-from-raw-html
-        label:
-          en_US: llm-extraction-from-raw-html
-      - value: llm-extraction-from-markdown
-        label:
-          en_US: llm-extraction-from-markdown
-    label:
-      en_US: Extractor Mode
-      zh_Hans: 提取模式
-    human_description:
-      en_US: |
-        The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM.
-      zh_Hans: 使用的提取模式。“markdown”：返回抓取的markdown内容，不执行LLM提取。“llm-extractioin”：使用LLM按Extractor Schema从内容中提取信息。
-    form: form
-  - name: extractionPrompt
-    type: string
-    label:
-      en_US: Extractor Prompt
-      zh_Hans: 提取时的提示词
-    human_description:
-      en_US: A prompt describing what information to extract from the page, applicable for LLM extraction modes.
-      zh_Hans: 当使用LLM提取模式时，用于给LLM描述提取规则。
-    form: form
-  - name: extractionSchema
+  - name: schema
     type: string
     label:
       en_US: Extractor Schema
       zh_Hans: 提取时的结构
     placeholder:
       en_US: Please enter an object that can be serialized in JSON
+      zh_Hans: 请输入可以json序列化的对象
     human_description:
       en_US: |
-        The schema for the data to be extracted, required only for LLM extraction modes. Example: {
+        The schema for the data to be extracted. Example: {
         "type": "object",
         "properties": {"company_mission": {"type": "string"}},
         "required": ["company_mission"]
         }
       zh_Hans: |
-        当使用LLM提取模式时，使用该结构去提取，示例：{
+        使用该结构去提取，示例：{
        "type": "object",
        "properties": {"company_mission": {"type": "string"}},
        "required": ["company_mission"]
        }
     form: form
+  - name: systemPrompt
+    type: string
+    label:
+      en_US: Extractor System Prompt
+      zh_Hans: 提取时的系统提示词
+    human_description:
+      en_US: The system prompt to use for the extraction.
+      zh_Hans: 用于提取的系统提示。
+    form: form
+  - name: prompt
+    type: string
+    label:
+      en_US: Extractor Prompt
+      zh_Hans: 提取时的提示词
+    human_description:
+      en_US: The prompt to use for the extraction without a schema.
+      zh_Hans: 用于无schema时提取的提示词
+    form: form
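As a worked example of the schema parameter above: the form value is a JSON object string (single quotes are tolerated by get_json_params), parsed and then nested under extract in the scrape payload. The prompt values here are illustrative:

import json

# Form value as a user might type it; get_json_params tolerates
# single quotes by rewriting them before parsing.
schema_param = (
    "{'type': 'object', "
    "'properties': {'company_mission': {'type': 'string'}}, "
    "'required': ['company_mission']}"
)
schema = json.loads(schema_param.replace("'", '"'))

extract = {
    "schema": schema,
    "systemPrompt": "You extract structured data from web pages.",  # illustrative
    "prompt": "Extract the company mission.",                       # illustrative
}
print(extract["schema"]["required"])  # ['company_mission']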

View File

@@ -1,27 +0,0 @@
from typing import Any

from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
from core.tools.tool.builtin_tool import BuiltinTool


class SearchTool(BuiltinTool):
    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
        """
        the pageOptions and searchOptions comes from doc here:
        https://docs.firecrawl.dev/api-reference/endpoint/search
        """
        app = FirecrawlApp(
            api_key=self.runtime.credentials["firecrawl_api_key"], base_url=self.runtime.credentials["base_url"]
        )
        pageOptions = {}
        pageOptions["onlyMainContent"] = tool_parameters.get("onlyMainContent", False)
        pageOptions["fetchPageContent"] = tool_parameters.get("fetchPageContent", True)
        pageOptions["includeHtml"] = tool_parameters.get("includeHtml", False)
        pageOptions["includeRawHtml"] = tool_parameters.get("includeRawHtml", False)
        searchOptions = {"limit": tool_parameters.get("limit")}

        search_result = app.search(
            query=tool_parameters["keyword"], pageOptions=pageOptions, searchOptions=searchOptions
        )
        return self.create_json_message(search_result)

View File

@@ -1,75 +0,0 @@
identity:
  name: search
  author: ahasasjeb
  label:
    en_US: Search
    zh_Hans: 搜索
description:
  human:
    en_US: Search, and output in Markdown format
    zh_Hans: 搜索并且以Markdown格式输出
  llm: This tool can perform online searches and convert the results to Markdown format.
parameters:
  - name: keyword
    type: string
    required: true
    label:
      en_US: keyword
      zh_Hans: 关键词
    human_description:
      en_US: Input keywords to use Firecrawl API for search.
      zh_Hans: 输入关键词即可使用Firecrawl API进行搜索。
    llm_description: Efficiently extract keywords from user text.
    form: llm
  ############## Page Options #######################
  - name: onlyMainContent
    type: boolean
    default: false
    label:
      en_US: only Main Content
      zh_Hans: 仅抓取主要内容
    human_description:
      en_US: Only return the main content of the page excluding headers, navs, footers, etc.
      zh_Hans: 只返回页面的主要内容，不包括头部、导航栏、尾部等。
    form: form
  - name: fetchPageContent
    type: boolean
    default: true
    label:
      en_US: fetch Page Content
      zh_Hans: 抓取页面内容
    human_description:
      en_US: Fetch the content of each page. If false, defaults to a basic fast serp API.
      zh_Hans: 获取每个页面的内容。如果为否，则使用基本的快速搜索结果页面API。
    form: form
  - name: includeHtml
    type: boolean
    default: false
    label:
      en_US: include Html
      zh_Hans: 包含HTML
    human_description:
      en_US: Include the HTML version of the content on page. Will output a html key in the response.
      zh_Hans: 返回中包含一个HTML版本的内容，将以html键返回。
    form: form
  - name: includeRawHtml
    type: boolean
    default: false
    label:
      en_US: include Raw Html
      zh_Hans: 包含原始HTML
    human_description:
      en_US: Include the raw HTML content of the page. Will output a rawHtml key in the response.
      zh_Hans: 返回中包含一个原始HTML版本的内容，将以rawHtml键返回。
    form: form
  ############## Search Options #######################
  - name: limit
    type: number
    min: 0
    label:
      en_US: Maximum results
      zh_Hans: 最大结果数量
    human_description:
      en_US: Maximum number of results. Max is 20 during beta.
      zh_Hans: 最大结果数量。在测试阶段最大为20。
    form: form