firecrawl/apps/api/openapi.json

929 lines
32 KiB
JSON
Raw Normal View History

2024-04-18 13:23:10 +08:00
{
2024-04-22 23:41:54 +08:00
"openapi": "3.0.0",
"info": {
"title": "Firecrawl API",
"version": "1.0.0",
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
"contact": {
"name": "Firecrawl Support",
"url": "https://firecrawl.dev/support",
"email": "support@firecrawl.dev"
}
},
"servers": [
{
"url": "https://api.firecrawl.dev/v0"
}
],
"paths": {
"/scrape": {
"post": {
"summary": "Scrape a single URL",
"operationId": "scrape",
2024-04-22 23:41:54 +08:00
"tags": ["Scraping"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The URL to scrape"
2024-04-18 13:23:10 +08:00
},
"formats": {
"type": "array",
"items": {
"type": "string",
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
},
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
"default": ["markdown"]
2024-05-16 03:11:16 +08:00
},
"headers": {
2024-05-16 03:11:16 +08:00
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"includeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"excludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": true
2024-05-16 03:11:16 +08:00
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for the request",
"default": 30000
},
"waitFor": {
"type": "integer",
"description": "Number of milliseconds to wait for the page to load before fetching its content",
"default": 0
2024-04-18 13:23:10 +08:00
}
2024-04-22 23:41:54 +08:00
},
"required": ["url"]
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
},
"responses": {
"200": {
"description": "Successful response",
2024-04-18 13:23:10 +08:00
"content": {
"application/json": {
"schema": {
2024-04-22 23:41:54 +08:00
"$ref": "#/components/schemas/ScrapeResponse"
2024-04-18 13:23:10 +08:00
}
}
}
},
2024-04-22 23:41:54 +08:00
"402": {
2024-07-18 21:34:03 +08:00
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
2024-04-22 23:41:54 +08:00
},
"429": {
2024-07-18 21:34:03 +08:00
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
2024-04-22 23:41:54 +08:00
},
"500": {
2024-07-18 21:34:03 +08:00
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
2024-04-18 13:23:10 +08:00
}
}
2024-04-22 23:41:54 +08:00
}
},
"/crawl": {
"post": {
"summary": "Crawl multiple URLs based on options",
"operationId": "crawlUrls",
"tags": ["Crawling"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
2024-04-18 13:23:10 +08:00
"schema": {
2024-04-22 23:41:54 +08:00
"type": "object",
"properties": {
"url": {
"type": "string",
"format": "uri",
"description": "The base URL to start crawling from"
},
"crawlerOptions": {
2024-04-18 13:23:10 +08:00
"type": "object",
"properties": {
2024-04-22 23:41:54 +08:00
"includes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to include"
2024-04-18 13:23:10 +08:00
},
2024-04-22 23:41:54 +08:00
"excludes": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to exclude"
2024-04-18 13:23:10 +08:00
},
2024-04-22 23:41:54 +08:00
"generateImgAltText": {
"type": "boolean",
"description": "Generate alt text for images using LLMs (must have a paid plan)",
"default": false
2024-04-18 13:23:10 +08:00
},
2024-04-22 23:41:54 +08:00
"returnOnlyUrls": {
"type": "boolean",
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
"default": false
2024-04-18 13:23:10 +08:00
},
2024-05-16 03:11:16 +08:00
"maxDepth": {
"type": "integer",
2024-06-27 08:22:46 +08:00
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
2024-05-16 03:11:16 +08:00
},
"mode": {
"type": "string",
"enum": ["default", "fast"],
"description": "The crawling mode to use. Fast mode crawls websites without a sitemap 4x faster, but may not be as accurate and shouldn't be used on heavily JS-rendered websites.",
"default": "default"
},
2024-06-11 09:26:25 +08:00
"ignoreSitemap": {
"type": "boolean",
"description": "Ignore the website sitemap when crawling",
"default": false
},
2024-04-22 23:41:54 +08:00
"limit": {
2024-04-18 13:23:10 +08:00
"type": "integer",
"description": "Maximum number of pages to crawl",
"default": 10000
2024-06-13 21:51:05 +08:00
},
"allowBackwardCrawling": {
"type": "boolean",
2024-07-18 02:07:06 +08:00
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
"default": false
},
"allowExternalContentLinks": {
"type": "boolean",
"description": "Allows the crawler to follow links to external websites.",
2024-06-13 21:51:05 +08:00
"default": false
2024-04-22 23:41:54 +08:00
}
}
},
"pageOptions": {
"type": "object",
"properties": {
2024-07-18 02:07:06 +08:00
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
2024-05-16 03:11:16 +08:00
},
"includeHtml": {
"type": "boolean",
2024-07-18 03:43:22 +08:00
"description": "Include the HTML version of the content on the page. Will output an html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
2024-05-16 03:11:16 +08:00
"default": false
2024-06-11 09:26:25 +08:00
},
2024-07-18 02:07:06 +08:00
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyMainContent": {
2024-06-11 09:26:25 +08:00
"type": "boolean",
2024-07-18 02:07:06 +08:00
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
2024-06-11 09:26:25 +08:00
"default": false
},
2024-06-13 21:51:05 +08:00
"removeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
2024-07-18 02:07:06 +08:00
},
"screenshot": {
"type": "boolean",
"description": "Include a screenshot of the top of the page that you are scraping.",
"default": false
},
"fullPageScreenshot": {
"type": "boolean",
"description": "Include a full page screenshot of the page that you are scraping.",
"default": false
2024-07-18 02:07:06 +08:00
},
"waitFor": {
"type": "integer",
"description": "Number of milliseconds to wait for the page to load before fetching its content",
"default": 0
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
},
"required": ["url"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CrawlResponse"
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
},
"402": {
2024-07-18 21:34:03 +08:00
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
2024-04-22 23:41:54 +08:00
},
"429": {
2024-07-18 21:34:03 +08:00
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
2024-04-22 23:41:54 +08:00
},
"500": {
2024-07-18 21:34:03 +08:00
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
2024-04-18 13:23:10 +08:00
}
}
}
},
2024-04-25 01:11:44 +08:00
"/search": {
"post": {
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
"operationId": "searchGoogle",
"tags": ["Search"],
"security": [
{
"bearerAuth": []
}
],
"requestBody": {
"required": true,
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"query": {
"type": "string",
2024-05-17 02:03:32 +08:00
"description": "The query to search for"
2024-04-25 01:11:44 +08:00
},
"pageOptions": {
"type": "object",
"properties": {
"onlyMainContent": {
"type": "boolean",
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
"default": false
},
"fetchPageContent": {
"type": "boolean",
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
"default": true
2024-05-16 03:11:16 +08:00
},
"includeHtml": {
"type": "boolean",
2024-07-18 03:43:22 +08:00
"description": "Include the HTML version of the content on the page. Will output an html key in the response.",
"default": false
},
"includeRawHtml": {
"type": "boolean",
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
2024-05-16 03:11:16 +08:00
"default": false
2024-04-25 01:11:44 +08:00
}
}
},
"searchOptions": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"description": "Maximum number of results. Max is 20 during beta."
}
}
}
},
"required": ["query"]
}
}
}
},
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SearchResponse"
}
}
}
},
"402": {
2024-07-18 21:34:03 +08:00
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
2024-04-25 01:11:44 +08:00
},
"429": {
2024-07-18 21:34:03 +08:00
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
2024-04-25 01:11:44 +08:00
},
"500": {
2024-07-18 21:34:03 +08:00
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
2024-04-25 01:11:44 +08:00
}
}
}
},
2024-04-22 23:41:54 +08:00
"/crawl/status/{jobId}": {
"get": {
"tags": ["Crawl"],
"summary": "Get the status of a crawl job",
"operationId": "getCrawlStatus",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
2024-04-18 13:23:10 +08:00
"type": "object",
"properties": {
2024-04-22 23:41:54 +08:00
"status": {
"type": "string",
"description": "Status of the job (completed, active, failed, paused)"
2024-04-18 13:23:10 +08:00
},
2024-04-22 23:41:54 +08:00
"current": {
"type": "integer",
"description": "Current page number"
2024-04-18 13:23:10 +08:00
},
2024-04-22 23:41:54 +08:00
"total": {
"type": "integer",
"description": "Total number of pages"
},
"data": {
"type": "array",
"items": {
2024-05-17 02:03:32 +08:00
"$ref": "#/components/schemas/CrawlStatusResponseObj"
2024-04-22 23:41:54 +08:00
},
"description": "Data returned from the job (null when it is in progress)"
2024-05-16 03:11:16 +08:00
},
"partial_data": {
"type": "array",
"items": {
2024-05-17 02:03:32 +08:00
"$ref": "#/components/schemas/CrawlStatusResponseObj"
2024-05-16 03:11:16 +08:00
},
2024-07-18 03:43:22 +08:00
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
2024-05-16 03:11:16 +08:00
}
}
}
}
}
},
"402": {
2024-07-18 21:34:03 +08:00
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
2024-05-16 03:11:16 +08:00
},
"429": {
2024-07-18 21:34:03 +08:00
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
2024-05-16 03:11:16 +08:00
},
"500": {
2024-07-18 21:34:03 +08:00
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
2024-05-16 03:11:16 +08:00
}
}
}
},
"/crawl/cancel/{jobId}": {
"delete": {
"tags": ["Crawl"],
"summary": "Cancel a crawl job",
"operationId": "cancelCrawlJob",
"security": [
{
"bearerAuth": []
}
],
"parameters": [
{
"name": "jobId",
"in": "path",
"description": "ID of the crawl job",
"required": true,
"schema": {
"type": "string"
}
}
],
"responses": {
"200": {
"description": "Successful response",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"status": {
"type": "string",
"description": "Returns cancelled."
2024-04-18 13:23:10 +08:00
}
}
}
}
}
2024-04-22 23:41:54 +08:00
},
"402": {
2024-07-18 21:34:03 +08:00
"description": "Payment required",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Payment required to access this resource."
}
}
}
}
}
2024-04-22 23:41:54 +08:00
},
"429": {
2024-07-18 21:34:03 +08:00
"description": "Too many requests",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "Request rate limit exceeded. Please wait and try again later."
}
}
}
}
}
2024-04-22 23:41:54 +08:00
},
"500": {
2024-07-18 21:34:03 +08:00
"description": "Server error",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"error": {
"type": "string",
"example": "An unexpected error occurred on the server."
}
}
}
}
}
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
}
},
"components": {
"securitySchemes": {
"bearerAuth": {
"type": "http",
"scheme": "bearer"
}
2024-04-18 13:23:10 +08:00
},
2024-04-22 23:41:54 +08:00
"schemas": {
"ScrapeResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"warning": {
"type": "string",
"nullable": true,
"description": "Warning message to let you know of any issues."
},
2024-04-22 23:41:54 +08:00
"data": {
"type": "object",
"properties": {
2024-04-25 01:11:44 +08:00
"markdown": {
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
2024-04-22 23:41:54 +08:00
},
2024-05-16 03:11:16 +08:00
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if the `html` format was specified"
2024-07-18 03:43:22 +08:00
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
2024-05-16 03:11:16 +08:00
},
2024-04-25 01:11:44 +08:00
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
2024-06-27 08:22:46 +08:00
"<any other metadata> ": {
"type": "string"
2024-06-27 08:25:38 +08:00
},
"statusCode": {
2024-06-27 08:25:38 +08:00
"type": "integer",
"description": "The status code of the page"
},
"error": {
2024-06-27 08:25:38 +08:00
"type": "string",
"nullable": true,
"description": "The error message of the page"
2024-04-25 01:11:44 +08:00
}
}
}
}
}
}
},
2024-05-17 02:03:32 +08:00
"CrawlStatusResponseObj": {
"type": "object",
"properties": {
"markdown": {
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
2024-05-17 02:03:32 +08:00
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if the `html` format was specified"
2024-07-18 03:43:22 +08:00
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
2024-05-17 02:03:32 +08:00
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
2024-06-11 09:26:25 +08:00
},
2024-05-17 02:03:32 +08:00
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
2024-06-27 08:25:38 +08:00
},
"<any other metadata> ": {
"type": "string"
},
"statusCode": {
2024-06-27 08:25:38 +08:00
"type": "integer",
"description": "The status code of the page"
},
"error": {
2024-06-27 08:25:38 +08:00
"type": "string",
"nullable": true,
"description": "The error message of the page"
2024-05-17 02:03:32 +08:00
}
}
}
}
},
2024-04-25 01:11:44 +08:00
"SearchResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"data": {
2024-04-26 04:28:07 +08:00
"type": "array",
"items": {
"type": "object",
"properties": {
"markdown": {
"type": "string",
"nullable": true,
"description": "Markdown content of the page if the `markdown` format was specified (default)"
},
"html": {
"type": "string",
"nullable": true,
"description": "HTML version of the content on page if the `html` format was specified"
},
"rawHtml": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
},
"links": {
"type": "array",
"items": {
"type": "string",
"format": "uri"
2024-04-26 04:28:07 +08:00
},
"nullable": true,
"description": "Links on the page if the `links` format was specified"
},
"screenshot": {
"type": "string",
"nullable": true,
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullPage` format was specified"
},
"metadata": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"description": {
"type": "string"
},
"language": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"statusCode": {
"type": "integer",
"description": "The status code of the page"
},
"error": {
"type": "string",
"nullable": true,
"description": "The error message of the page"
2024-04-22 23:41:54 +08:00
}
}
}
}
}
}
}
},
"CrawlResponse": {
"type": "object",
"properties": {
"success": {
"type": "boolean"
},
"id": {
2024-04-22 23:41:54 +08:00
"type": "string"
},
"url": {
"type": "string",
"format": "uri"
2024-04-22 23:41:54 +08:00
}
}
2024-04-18 13:23:10 +08:00
}
2024-04-22 23:41:54 +08:00
}
},
"security": [
{
"bearerAuth": []
}
]
2024-06-27 08:22:46 +08:00
}