2024-04-18 13:23:10 +08:00
{
2024-04-22 23:41:54 +08:00
"openapi" : "3.0.0" ,
"info" : {
"title" : "Firecrawl API" ,
"version" : "1.0.0" ,
"description" : "API for interacting with Firecrawl services to perform web scraping and crawling tasks." ,
"contact" : {
"name" : "Firecrawl Support" ,
"url" : "https://firecrawl.dev/support" ,
"email" : "support@firecrawl.dev"
}
} ,
"servers" : [
{
"url" : "https://api.firecrawl.dev/v0"
}
] ,
"paths" : {
"/scrape" : {
"post" : {
2024-05-16 03:11:16 +08:00
"summary" : "Scrape a single URL and optionally extract information using an LLM" ,
"operationId" : "scrapeAndExtractFromUrl" ,
2024-04-22 23:41:54 +08:00
"tags" : [ "Scraping" ] ,
"security" : [
{
"bearerAuth" : [ ]
}
] ,
"requestBody" : {
"required" : true ,
"content" : {
"application/json" : {
"schema" : {
"type" : "object" ,
"properties" : {
"url" : {
"type" : "string" ,
"format" : "uri" ,
"description" : "The URL to scrape"
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"pageOptions" : {
"type" : "object" ,
"properties" : {
"onlyMainContent" : {
"type" : "boolean" ,
"description" : "Only return the main content of the page excluding headers, navs, footers, etc." ,
"default" : false
2024-05-16 03:11:16 +08:00
} ,
"includeHtml" : {
"type" : "boolean" ,
"description" : "Include the raw HTML content of the page. Will output a html key in the response." ,
"default" : false
2024-05-29 03:56:24 +08:00
} ,
2024-06-11 09:26:25 +08:00
"screenshot" : {
"type" : "boolean" ,
"description" : "Include a screenshot of the top of the page that you are scraping." ,
"default" : false
} ,
2024-05-29 03:56:24 +08:00
"waitFor" : {
"type" : "integer" ,
"description" : "Wait x amount of milliseconds for the page to load to fetch content" ,
"default" : 0
2024-06-11 09:26:25 +08:00
} ,
2024-06-13 21:51:05 +08:00
"removeTags" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
"description" : "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
} ,
2024-06-27 08:22:46 +08:00
"onlyIncludeTags" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
"description" : "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
} ,
2024-06-11 09:26:25 +08:00
"headers" : {
"type" : "object" ,
"description" : "Headers to send with the request. Can be used to send cookies, user-agent, etc."
2024-06-27 08:22:46 +08:00
} ,
"replaceAllPathsWithAbsolutePaths" : {
"type" : "boolean" ,
"description" : "Replace all relative paths with absolute paths for images and links" ,
"default" : false
2024-05-16 03:11:16 +08:00
}
}
} ,
"extractorOptions" : {
"type" : "object" ,
"description" : "Options for LLM-based extraction of structured information from the page content" ,
"properties" : {
"mode" : {
"type" : "string" ,
2024-07-03 02:05:42 +08:00
"enum" : [ "llm-extraction" , "llm-extraction-from-raw-html" ] ,
"description" : "The extraction mode to use. llm-extraction: Extracts information from the cleaned and parsed content. llm-extraction-from-raw-html: Extracts information directly from the raw HTML."
2024-05-16 03:11:16 +08:00
} ,
"extractionPrompt" : {
"type" : "string" ,
"description" : "A prompt describing what information to extract from the page"
} ,
"extractionSchema" : {
"type" : "object" ,
"additionalProperties" : true ,
"description" : "The schema for the data to be extracted" ,
"required" : [
"company_mission" ,
"supports_sso" ,
"is_open_source"
]
2024-04-22 23:41:54 +08:00
}
}
2024-05-16 03:11:16 +08:00
} ,
"timeout" : {
"type" : "integer" ,
"description" : "Timeout in milliseconds for the request" ,
"default" : 30000
2024-04-18 13:23:10 +08:00
}
2024-04-22 23:41:54 +08:00
} ,
"required" : [ "url" ]
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
} ,
"responses" : {
"200" : {
"description" : "Successful response" ,
2024-04-18 13:23:10 +08:00
"content" : {
"application/json" : {
"schema" : {
2024-04-22 23:41:54 +08:00
"$ref" : "#/components/schemas/ScrapeResponse"
2024-04-18 13:23:10 +08:00
}
}
}
} ,
2024-04-22 23:41:54 +08:00
"402" : {
"description" : "Payment required"
} ,
"429" : {
"description" : "Too many requests"
} ,
"500" : {
"description" : "Server error"
2024-04-18 13:23:10 +08:00
}
}
2024-04-22 23:41:54 +08:00
}
} ,
"/crawl" : {
"post" : {
"summary" : "Crawl multiple URLs based on options" ,
"operationId" : "crawlUrls" ,
"tags" : [ "Crawling" ] ,
"security" : [
{
"bearerAuth" : [ ]
}
] ,
"requestBody" : {
"required" : true ,
"content" : {
"application/json" : {
2024-04-18 13:23:10 +08:00
"schema" : {
2024-04-22 23:41:54 +08:00
"type" : "object" ,
"properties" : {
"url" : {
"type" : "string" ,
"format" : "uri" ,
"description" : "The base URL to start crawling from"
} ,
"crawlerOptions" : {
2024-04-18 13:23:10 +08:00
"type" : "object" ,
"properties" : {
2024-04-22 23:41:54 +08:00
"includes" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
"description" : "URL patterns to include"
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"excludes" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
"description" : "URL patterns to exclude"
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"generateImgAltText" : {
"type" : "boolean" ,
"description" : "Generate alt text for images using LLMs (must have a paid plan)" ,
"default" : false
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"returnOnlyUrls" : {
"type" : "boolean" ,
"description" : "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents." ,
"default" : false
2024-04-18 13:23:10 +08:00
} ,
2024-05-16 03:11:16 +08:00
"maxDepth" : {
"type" : "integer" ,
2024-06-27 08:22:46 +08:00
"description" : "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
2024-05-16 03:11:16 +08:00
} ,
"mode" : {
"type" : "string" ,
"enum" : [ "default" , "fast" ] ,
"description" : "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites." ,
"default" : "default"
} ,
2024-06-11 09:26:25 +08:00
"ignoreSitemap" : {
"type" : "boolean" ,
"description" : "Ignore the website sitemap when crawling" ,
"default" : false
} ,
2024-04-22 23:41:54 +08:00
"limit" : {
2024-04-18 13:23:10 +08:00
"type" : "integer" ,
2024-05-10 22:59:33 +08:00
"description" : "Maximum number of pages to crawl" ,
"default" : 10000
2024-06-13 21:51:05 +08:00
} ,
"allowBackwardCrawling" : {
"type" : "boolean" ,
"description" : "Allow backward crawling (crawl from the base URL to the previous URLs)" ,
"default" : false
2024-04-22 23:41:54 +08:00
}
}
} ,
"pageOptions" : {
"type" : "object" ,
"properties" : {
"onlyMainContent" : {
"type" : "boolean" ,
"description" : "Only return the main content of the page excluding headers, navs, footers, etc." ,
"default" : false
2024-05-16 03:11:16 +08:00
} ,
"includeHtml" : {
"type" : "boolean" ,
"description" : "Include the raw HTML content of the page. Will output a html key in the response." ,
"default" : false
2024-06-11 09:26:25 +08:00
} ,
"screenshot" : {
"type" : "boolean" ,
"description" : "Include a screenshot of the top of the page that you are scraping." ,
"default" : false
} ,
"headers" : {
"type" : "object" ,
"description" : "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
2024-06-12 03:43:16 +08:00
} ,
2024-06-13 21:51:05 +08:00
"removeTags" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
"description" : "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
} ,
2024-06-12 03:43:16 +08:00
"replaceAllPathsWithAbsolutePaths" : {
"type" : "boolean" ,
"description" : "Replace all relative paths with absolute paths for images and links" ,
"default" : false
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
} ,
"required" : [ "url" ]
}
}
}
} ,
"responses" : {
"200" : {
"description" : "Successful response" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/CrawlResponse"
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
} ,
"402" : {
"description" : "Payment required"
} ,
"429" : {
"description" : "Too many requests"
} ,
"500" : {
"description" : "Server error"
2024-04-18 13:23:10 +08:00
}
}
}
} ,
2024-04-25 01:11:44 +08:00
"/search" : {
"post" : {
"summary" : "Search for a keyword in Google, returns top page results with markdown content for each page" ,
"operationId" : "searchGoogle" ,
"tags" : [ "Search" ] ,
"security" : [
{
"bearerAuth" : [ ]
}
] ,
"requestBody" : {
"required" : true ,
"content" : {
"application/json" : {
"schema" : {
"type" : "object" ,
"properties" : {
"query" : {
"type" : "string" ,
"format" : "uri" ,
2024-05-17 02:03:32 +08:00
"description" : "The query to search for"
2024-04-25 01:11:44 +08:00
} ,
"pageOptions" : {
"type" : "object" ,
"properties" : {
"onlyMainContent" : {
"type" : "boolean" ,
"description" : "Only return the main content of the page excluding headers, navs, footers, etc." ,
"default" : false
} ,
"fetchPageContent" : {
"type" : "boolean" ,
"description" : "Fetch the content of each page. If false, defaults to a basic fast serp API." ,
"default" : true
2024-05-16 03:11:16 +08:00
} ,
"includeHtml" : {
"type" : "boolean" ,
"description" : "Include the raw HTML content of the page. Will output a html key in the response." ,
"default" : false
2024-04-25 01:11:44 +08:00
}
}
} ,
"searchOptions" : {
"type" : "object" ,
"properties" : {
"limit" : {
"type" : "integer" ,
"description" : "Maximum number of results. Max is 20 during beta."
}
}
}
} ,
"required" : [ "query" ]
}
}
}
} ,
"responses" : {
"200" : {
"description" : "Successful response" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/SearchResponse"
}
}
}
} ,
"402" : {
"description" : "Payment required"
} ,
"429" : {
"description" : "Too many requests"
} ,
"500" : {
"description" : "Server error"
}
}
}
} ,
2024-04-22 23:41:54 +08:00
"/crawl/status/{jobId}" : {
"get" : {
"tags" : [ "Crawl" ] ,
"summary" : "Get the status of a crawl job" ,
"operationId" : "getCrawlStatus" ,
"security" : [
{
"bearerAuth" : [ ]
}
] ,
"parameters" : [
{
"name" : "jobId" ,
"in" : "path" ,
"description" : "ID of the crawl job" ,
"required" : true ,
"schema" : {
"type" : "string"
}
}
] ,
"responses" : {
"200" : {
"description" : "Successful response" ,
"content" : {
"application/json" : {
"schema" : {
2024-04-18 13:23:10 +08:00
"type" : "object" ,
"properties" : {
2024-04-22 23:41:54 +08:00
"status" : {
"type" : "string" ,
"description" : "Status of the job (completed, active, failed, paused)"
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"current" : {
"type" : "integer" ,
"description" : "Current page number"
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"current_url" : {
2024-04-18 13:23:10 +08:00
"type" : "string" ,
2024-04-22 23:41:54 +08:00
"description" : "Current URL being scraped"
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"current_step" : {
2024-04-18 13:23:10 +08:00
"type" : "string" ,
2024-04-22 23:41:54 +08:00
"description" : "Current step in the process"
} ,
"total" : {
"type" : "integer" ,
"description" : "Total number of pages"
} ,
"data" : {
"type" : "array" ,
"items" : {
2024-05-17 02:03:32 +08:00
"$ref" : "#/components/schemas/CrawlStatusResponseObj"
2024-04-22 23:41:54 +08:00
} ,
"description" : "Data returned from the job (null when it is in progress)"
2024-05-16 03:11:16 +08:00
} ,
"partial_data" : {
"type" : "array" ,
"items" : {
2024-05-17 02:03:32 +08:00
"$ref" : "#/components/schemas/CrawlStatusResponseObj"
2024-05-16 03:11:16 +08:00
} ,
2024-06-11 09:26:25 +08:00
"description" : "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
2024-05-16 03:11:16 +08:00
}
}
}
}
}
} ,
"402" : {
"description" : "Payment required"
} ,
"429" : {
"description" : "Too many requests"
} ,
"500" : {
"description" : "Server error"
}
}
}
} ,
"/crawl/cancel/{jobId}" : {
"delete" : {
"tags" : [ "Crawl" ] ,
"summary" : "Cancel a crawl job" ,
"operationId" : "cancelCrawlJob" ,
"security" : [
{
"bearerAuth" : [ ]
}
] ,
"parameters" : [
{
"name" : "jobId" ,
"in" : "path" ,
"description" : "ID of the crawl job" ,
"required" : true ,
"schema" : {
"type" : "string"
}
}
] ,
"responses" : {
"200" : {
"description" : "Successful response" ,
"content" : {
"application/json" : {
"schema" : {
"type" : "object" ,
"properties" : {
"status" : {
"type" : "string" ,
"description" : "Returns cancelled."
2024-04-18 13:23:10 +08:00
}
}
}
}
}
2024-04-22 23:41:54 +08:00
} ,
"402" : {
"description" : "Payment required"
} ,
"429" : {
"description" : "Too many requests"
} ,
"500" : {
"description" : "Server error"
2024-04-18 13:23:10 +08:00
}
}
}
2024-04-22 23:41:54 +08:00
}
} ,
"components" : {
"securitySchemes" : {
"bearerAuth" : {
"type" : "http" ,
"scheme" : "bearer"
}
2024-04-18 13:23:10 +08:00
} ,
2024-04-22 23:41:54 +08:00
"schemas" : {
"ScrapeResponse" : {
"type" : "object" ,
"properties" : {
"success" : {
"type" : "boolean"
} ,
"data" : {
"type" : "object" ,
"properties" : {
2024-04-25 01:11:44 +08:00
"markdown" : {
"type" : "string"
} ,
2024-04-22 23:41:54 +08:00
"content" : {
"type" : "string"
} ,
2024-05-16 03:11:16 +08:00
"html" : {
"type" : "string" ,
"nullable" : true ,
2024-06-27 08:22:46 +08:00
"description" : "Raw HTML content of the page if `includeHtml` is true"
2024-05-16 03:11:16 +08:00
} ,
2024-04-25 01:11:44 +08:00
"metadata" : {
"type" : "object" ,
"properties" : {
"title" : {
"type" : "string"
} ,
"description" : {
"type" : "string"
} ,
"language" : {
"type" : "string" ,
"nullable" : true
} ,
"sourceURL" : {
"type" : "string" ,
"format" : "uri"
2024-06-14 04:08:40 +08:00
} ,
2024-06-27 08:22:46 +08:00
"<any other metadata> " : {
"type" : "string"
2024-06-27 08:25:38 +08:00
} ,
"pageStatusCode" : {
"type" : "integer" ,
"description" : "The status code of the page"
} ,
"pageError" : {
"type" : "string" ,
"nullable" : true ,
"description" : "The error message of the page"
2024-04-25 01:11:44 +08:00
}
2024-06-27 08:22:46 +08:00
2024-04-25 01:11:44 +08:00
}
2024-05-21 08:10:55 +08:00
} ,
"llm_extraction" : {
"type" : "object" ,
"description" : "Displayed when using LLM Extraction. Extracted data from the page following the schema defined." ,
"nullable" : true
} ,
"warning" : {
"type" : "string" ,
"nullable" : true ,
"description" : "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
2024-04-25 01:11:44 +08:00
}
}
}
}
} ,
2024-05-17 02:03:32 +08:00
"CrawlStatusResponseObj" : {
"type" : "object" ,
"properties" : {
"markdown" : {
"type" : "string"
} ,
"content" : {
"type" : "string"
} ,
"html" : {
"type" : "string" ,
"nullable" : true ,
"description" : "Raw HTML content of the page if `includeHtml` is true"
} ,
2024-06-11 09:26:25 +08:00
"index" : {
"type" : "integer" ,
"description" : "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
} ,
2024-05-17 02:03:32 +08:00
"metadata" : {
"type" : "object" ,
"properties" : {
"title" : {
"type" : "string"
} ,
"description" : {
"type" : "string"
} ,
"language" : {
"type" : "string" ,
"nullable" : true
} ,
"sourceURL" : {
"type" : "string" ,
"format" : "uri"
2024-06-27 08:25:38 +08:00
} ,
"<any other metadata> " : {
"type" : "string"
} ,
"pageStatusCode" : {
"type" : "integer" ,
"description" : "The status code of the page"
} ,
"pageError" : {
"type" : "string" ,
"nullable" : true ,
"description" : "The error message of the page"
2024-05-17 02:03:32 +08:00
}
}
}
}
} ,
2024-04-25 01:11:44 +08:00
"SearchResponse" : {
"type" : "object" ,
"properties" : {
"success" : {
"type" : "boolean"
} ,
"data" : {
2024-04-26 04:28:07 +08:00
"type" : "array" ,
"items" : {
"type" : "object" ,
"properties" : {
"url" : {
"type" : "string"
} ,
"markdown" : {
"type" : "string"
} ,
"content" : {
"type" : "string"
} ,
"metadata" : {
"type" : "object" ,
"properties" : {
"title" : {
"type" : "string"
} ,
"description" : {
"type" : "string"
} ,
"language" : {
"type" : "string" ,
"nullable" : true
} ,
"sourceURL" : {
"type" : "string" ,
"format" : "uri"
}
2024-04-22 23:41:54 +08:00
}
}
}
}
}
}
} ,
"CrawlResponse" : {
"type" : "object" ,
"properties" : {
"jobId" : {
"type" : "string"
}
}
2024-04-18 13:23:10 +08:00
}
2024-04-22 23:41:54 +08:00
}
} ,
"security" : [
{
"bearerAuth" : [ ]
}
]
2024-06-27 08:22:46 +08:00
}