mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-15 19:22:19 +08:00
Merge branch 'main' into feat/issue-266
This commit is contained in:
commit
f9c7ca9388
|
@ -61,6 +61,13 @@
|
|||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
|
@ -194,6 +201,11 @@
|
|||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
"default": 10000
|
||||
},
|
||||
"allowBackwardCrawling": {
|
||||
"type": "boolean",
|
||||
"description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
|
@ -219,6 +231,13 @@
|
|||
"type": "object",
|
||||
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
|
|
|
@ -143,6 +143,55 @@ describe("E2E Tests for API Routes", () => {
|
|||
expect(response.body.data.metadata.pageError).toBeUndefined();
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post('/v0/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', pageOptions: { parsePDF: false } });
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
expect(response.body.data).toHaveProperty('content');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
|
||||
}, 60000); // 60 seconds
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||
const responseWithoutRemoveTags = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://www.scrapethissite.com/" });
|
||||
expect(responseWithoutRemoveTags.statusCode).toBe(200);
|
||||
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
|
||||
expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v0/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data).toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.content).toContain("Scrape This Site");
|
||||
expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
|
||||
expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
|
||||
expect(response.body.data.content).not.toContain("web scraping"); // strong
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
// TODO: add this test back once we nail the waitFor option to be more deterministic
|
||||
// it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
|
||||
// const startTime = Date.now();
|
||||
|
|
|
@ -55,8 +55,16 @@ export async function crawlController(req: Request, res: Response) {
|
|||
}
|
||||
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {
|
||||
allowBackwardCrawling: false
|
||||
};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
removeTags: [],
|
||||
parsePDF: true
|
||||
};
|
||||
|
||||
if (mode === "single_urls" && !url.includes(",")) {
|
||||
try {
|
||||
|
|
|
@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
|
|||
|
||||
const mode = req.body.mode ?? "crawl";
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
|
||||
|
||||
const job = await addWebScraperJob({
|
||||
url: url,
|
||||
|
|
|
@ -105,7 +105,13 @@ export async function scrapeController(req: Request, res: Response) {
|
|||
return res.status(status).json({ error });
|
||||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, waitFor: 0, screenshot: false };
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
parsePDF: true
|
||||
};
|
||||
const extractorOptions = req.body.extractorOptions ?? {
|
||||
mode: "markdown"
|
||||
}
|
||||
|
|
|
@ -85,6 +85,7 @@ export async function searchHelper(
|
|||
onlyMainContent: pageOptions?.onlyMainContent ?? true,
|
||||
fetchPageContent: pageOptions?.fetchPageContent ?? true,
|
||||
includeHtml: pageOptions?.includeHtml ?? false,
|
||||
removeTags: pageOptions?.removeTags ?? [],
|
||||
fallback: false,
|
||||
},
|
||||
});
|
||||
|
@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
|
|||
includeHtml: false,
|
||||
onlyMainContent: true,
|
||||
fetchPageContent: true,
|
||||
removeTags: [],
|
||||
fallback: false,
|
||||
};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
|
|
@ -19,6 +19,8 @@ export type PageOptions = {
|
|||
screenshot?: boolean;
|
||||
headers?: Record<string, string>;
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
parsePDF?: boolean;
|
||||
removeTags?: string | string[];
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||
|
||||
export async function handleCustomScraping(
|
||||
text: string,
|
||||
url: string
|
||||
|
|
|
@ -280,7 +280,7 @@ export class WebScraperDataProvider {
|
|||
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
|
||||
return Promise.all(
|
||||
pdfLinks.map(async (pdfLink) => {
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink);
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF);
|
||||
return {
|
||||
content: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
|
@ -475,7 +475,13 @@ export class WebScraperDataProvider {
|
|||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
|
||||
this.pageOptions = options.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
replaceAllPathsWithAbsolutePaths: false,
|
||||
parsePDF: true,
|
||||
removeTags: []
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||
|
|
|
@ -49,7 +49,7 @@ export async function scrapWithFireEngine(
|
|||
url: string,
|
||||
waitFor: number = 0,
|
||||
screenshot: boolean = false,
|
||||
pageOptions: { scrollXPaths?: string[] } = {},
|
||||
pageOptions: { scrollXPaths?: string[], parsePDF?: boolean } = { parsePDF: true },
|
||||
headers?: Record<string, string>,
|
||||
options?: any
|
||||
): Promise<FireEngineResponse> {
|
||||
|
@ -88,7 +88,7 @@ export async function scrapWithFireEngine(
|
|||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url);
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
||||
} else {
|
||||
const data = response.data;
|
||||
|
@ -109,7 +109,8 @@ export async function scrapWithFireEngine(
|
|||
export async function scrapWithScrapingBee(
|
||||
url: string,
|
||||
wait_browser: string = "domcontentloaded",
|
||||
timeout: number = universalTimeout
|
||||
timeout: number = universalTimeout,
|
||||
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
try {
|
||||
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
|
||||
|
@ -129,7 +130,8 @@ export async function scrapWithScrapingBee(
|
|||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return await fetchAndProcessPdf(url);
|
||||
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||
|
||||
} else {
|
||||
let text = "";
|
||||
try {
|
||||
|
@ -149,7 +151,8 @@ export async function scrapWithScrapingBee(
|
|||
export async function scrapWithPlaywright(
|
||||
url: string,
|
||||
waitFor: number = 0,
|
||||
headers?: Record<string, string>
|
||||
headers?: Record<string, string>,
|
||||
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
try {
|
||||
const reqParams = await generateRequestParams(url);
|
||||
|
@ -177,7 +180,7 @@ export async function scrapWithPlaywright(
|
|||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return await fetchAndProcessPdf(url);
|
||||
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||
} else {
|
||||
const textData = response.data;
|
||||
try {
|
||||
|
@ -199,7 +202,10 @@ export async function scrapWithPlaywright(
|
|||
}
|
||||
}
|
||||
|
||||
export async function scrapWithFetch(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
export async function scrapWithFetch(
|
||||
url: string,
|
||||
pageOptions: { parsePDF?: boolean } = { parsePDF: true }
|
||||
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
try {
|
||||
const response = await axios.get(url, {
|
||||
headers: {
|
||||
|
@ -218,7 +224,7 @@ export async function scrapWithFetch(url: string): Promise<{ content: string, pa
|
|||
|
||||
const contentType = response.headers["content-type"];
|
||||
if (contentType && contentType.includes("application/pdf")) {
|
||||
return await fetchAndProcessPdf(url);
|
||||
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||
} else {
|
||||
const text = response.data;
|
||||
return { content: text, pageStatusCode: 200 };
|
||||
|
@ -309,6 +315,19 @@ export async function scrapSingleUrl(
|
|||
const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||
const soup = cheerio.load(html);
|
||||
soup("script, style, iframe, noscript, meta, head").remove();
|
||||
|
||||
if (pageOptions.removeTags) {
|
||||
if (typeof pageOptions.removeTags === 'string') {
|
||||
pageOptions.removeTags.split(',').forEach((tag) => {
|
||||
soup(tag.trim()).remove();
|
||||
});
|
||||
} else if (Array.isArray(pageOptions.removeTags)) {
|
||||
pageOptions.removeTags.forEach((tag) => {
|
||||
soup(tag).remove();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (pageOptions.onlyMainContent) {
|
||||
// remove any other tags that are not in the main content
|
||||
excludeNonMainTags.forEach((tag) => {
|
||||
|
@ -390,7 +409,7 @@ export async function scrapSingleUrl(
|
|||
}
|
||||
break;
|
||||
case "pdf":
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url);
|
||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
|
||||
customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ import * as pdfProcessor from '../pdfProcessor';
|
|||
describe('PDF Processing Module - Integration Test', () => {
|
||||
it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
|
||||
delete process.env.LLAMAPARSE_API_KEY;
|
||||
const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
|
||||
const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
|
||||
expect(content.trim()).toEqual("Dummy PDF file");
|
||||
expect(pageStatusCode).toEqual(200);
|
||||
expect(pageError).toBeUndefined();
|
||||
|
|
|
@ -9,9 +9,9 @@ import os from "os";
|
|||
|
||||
dotenv.config();
|
||||
|
||||
export async function fetchAndProcessPdf(url: string): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
|
||||
const content = await processPdfToText(tempFilePath);
|
||||
const content = await processPdfToText(tempFilePath, parsePDF);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
return { content, pageStatusCode, pageError };
|
||||
}
|
||||
|
@ -34,10 +34,10 @@ async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageSta
|
|||
});
|
||||
}
|
||||
|
||||
export async function processPdfToText(filePath: string): Promise<string> {
|
||||
export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
|
||||
let content = "";
|
||||
|
||||
if (process.env.LLAMAPARSE_API_KEY) {
|
||||
if (process.env.LLAMAPARSE_API_KEY && parsePDF) {
|
||||
const apiKey = process.env.LLAMAPARSE_API_KEY;
|
||||
const headers = {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
|
@ -95,8 +95,10 @@ export async function processPdfToText(filePath: string): Promise<string> {
|
|||
console.error("Error processing pdf document w/ LlamaIndex(2)");
|
||||
content = await processPdf(filePath);
|
||||
}
|
||||
} else {
|
||||
} else if (parsePDF) {
|
||||
content = await processPdf(filePath);
|
||||
} else {
|
||||
content = fs.readFileSync(filePath, "utf-8");
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
|
|
@ -1,3 +1,57 @@
|
|||
"""
|
||||
This is the Firecrawl package.
|
||||
|
||||
This package provides a Python SDK for interacting with the Firecrawl API.
|
||||
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
||||
and check the status of these jobs.
|
||||
|
||||
For more information visit https://github.com/firecrawl/
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from .firecrawl import FirecrawlApp
|
||||
|
||||
__version__ = "0.0.14"
|
||||
__version__ = "0.0.16"
|
||||
|
||||
# Define the logger for the Firecrawl project
|
||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||
|
||||
|
||||
def _basic_config() -> None:
|
||||
"""Set up basic configuration for logging with a specific format and date format."""
|
||||
try:
|
||||
logging.basicConfig(
|
||||
format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Failed to configure logging: %s", e)
|
||||
|
||||
|
||||
def setup_logging() -> None:
|
||||
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
|
||||
env = os.environ.get(
|
||||
"FIRECRAWL_LOGGING_LEVEL", "INFO"
|
||||
).upper() # Default to 'INFO' level
|
||||
_basic_config()
|
||||
|
||||
if env == "DEBUG":
|
||||
logger.setLevel(logging.DEBUG)
|
||||
elif env == "INFO":
|
||||
logger.setLevel(logging.INFO)
|
||||
elif env == "WARNING":
|
||||
logger.setLevel(logging.WARNING)
|
||||
elif env == "ERROR":
|
||||
logger.setLevel(logging.ERROR)
|
||||
elif env == "CRITICAL":
|
||||
logger.setLevel(logging.CRITICAL)
|
||||
else:
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.warning("Unknown logging level: %s, defaulting to INFO", env)
|
||||
|
||||
|
||||
# Initialize logging configuration when the module is imported
|
||||
setup_logging()
|
||||
logger.debug("Debugging logger setup")
|
||||
|
|
Binary file not shown.
|
@ -27,14 +27,14 @@ def test_scrape_url_invalid_api_key():
|
|||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.scrape_url('https://firecrawl.dev')
|
||||
assert "Failed to scrape URL. Status code: 401" in str(excinfo.value)
|
||||
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Failed to scrape URL. Status code: 403" in str(excinfo.value)
|
||||
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
|
||||
|
@ -86,14 +86,14 @@ def test_crawl_url_invalid_api_key():
|
|||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.crawl_url('https://firecrawl.dev')
|
||||
assert "Unexpected error occurred while trying to start crawl job. Status code: 401" in str(excinfo.value)
|
||||
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
blocklisted_url = "https://twitter.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url(blocklisted_url)
|
||||
assert "Unexpected error occurred while trying to start crawl job. Status code: 403" in str(excinfo.value)
|
||||
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
|
@ -114,7 +114,7 @@ def test_crawl_url_with_idempotency_key_e2e():
|
|||
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
||||
assert "Failed to start crawl job. Status code: 409. Error: Idempotency key already used" in str(excinfo.value)
|
||||
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
|
@ -141,7 +141,7 @@ def test_search_invalid_api_key():
|
|||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.search("test query")
|
||||
assert "Failed to search. Status code: 401" in str(excinfo.value)
|
||||
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_llm_extraction():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
|
|
|
@ -9,13 +9,14 @@ and handles retries for certain HTTP status codes.
|
|||
Classes:
|
||||
- FirecrawlApp: Main class for interacting with the Firecrawl API.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
|
||||
logger : logging.Logger = logging.getLogger("firecrawl")
|
||||
|
||||
class FirecrawlApp:
|
||||
"""
|
||||
|
@ -28,8 +29,15 @@ class FirecrawlApp:
|
|||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
if self.api_key is None:
|
||||
logger.warning("No API key provided")
|
||||
raise ValueError('No API key provided')
|
||||
else:
|
||||
logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
|
||||
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
if self.api_url != 'https://api.firecrawl.dev':
|
||||
logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
|
||||
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
Scrape the specified URL using the Firecrawl API.
|
||||
|
@ -45,10 +53,8 @@ class FirecrawlApp:
|
|||
Exception: If the scrape request fails.
|
||||
"""
|
||||
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
}
|
||||
headers = self._prepare_headers()
|
||||
|
||||
# Prepare the base scrape parameters with the URL
|
||||
scrape_params = {'url': url}
|
||||
|
||||
|
@ -81,13 +87,10 @@ class FirecrawlApp:
|
|||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||
elif response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
|
||||
self._handle_error(response, 'scrape URL')
|
||||
|
||||
def search(self, query, params=None):
|
||||
def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
Perform a search using the Firecrawl API.
|
||||
|
||||
|
@ -101,10 +104,7 @@ class FirecrawlApp:
|
|||
Raises:
|
||||
Exception: If the search request fails.
|
||||
"""
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
}
|
||||
headers = self._prepare_headers()
|
||||
json_data = {'query': query}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
|
@ -121,13 +121,14 @@ class FirecrawlApp:
|
|||
else:
|
||||
raise Exception(f'Failed to search. Error: {response["error"]}')
|
||||
|
||||
elif response.status_code in [402, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
raise Exception(f'Failed to search. Status code: {response.status_code}')
|
||||
self._handle_error(response, 'search')
|
||||
|
||||
def crawl_url(self, url, params=None, wait_until_done=True, poll_interval=2, idempotency_key=None):
|
||||
def crawl_url(self, url: str,
|
||||
params: Optional[Dict[str, Any]] = None,
|
||||
wait_until_done: bool = True,
|
||||
poll_interval: int = 2,
|
||||
idempotency_key: Optional[str] = None) -> Any:
|
||||
"""
|
||||
Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||
|
||||
|
@ -158,7 +159,7 @@ class FirecrawlApp:
|
|||
else:
|
||||
self._handle_error(response, 'start crawl job')
|
||||
|
||||
def check_crawl_status(self, job_id):
|
||||
def check_crawl_status(self, job_id: str) -> Any:
|
||||
"""
|
||||
Check the status of a crawl job using the Firecrawl API.
|
||||
|
||||
|
@ -178,7 +179,7 @@ class FirecrawlApp:
|
|||
else:
|
||||
self._handle_error(response, 'check crawl status')
|
||||
|
||||
def _prepare_headers(self, idempotency_key=None):
|
||||
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
||||
|
@ -200,7 +201,11 @@ class FirecrawlApp:
|
|||
'Authorization': f'Bearer {self.api_key}',
|
||||
}
|
||||
|
||||
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
|
||||
def _post_request(self, url: str,
|
||||
data: Dict[str, Any],
|
||||
headers: Dict[str, str],
|
||||
retries: int = 3,
|
||||
backoff_factor: float = 0.5) -> requests.Response:
|
||||
"""
|
||||
Make a POST request with retries.
|
||||
|
||||
|
@ -225,7 +230,10 @@ class FirecrawlApp:
|
|||
return response
|
||||
return response
|
||||
|
||||
def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
|
||||
def _get_request(self, url: str,
|
||||
headers: Dict[str, str],
|
||||
retries: int = 3,
|
||||
backoff_factor: float = 0.5) -> requests.Response:
|
||||
"""
|
||||
Make a GET request with retries.
|
||||
|
||||
|
@ -249,7 +257,7 @@ class FirecrawlApp:
|
|||
return response
|
||||
return response
|
||||
|
||||
def _monitor_job_status(self, job_id, headers, poll_interval):
|
||||
def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
|
||||
"""
|
||||
Monitor the status of a crawl job until completion.
|
||||
|
||||
|
@ -281,7 +289,7 @@ class FirecrawlApp:
|
|||
else:
|
||||
self._handle_error(status_response, 'check crawl status')
|
||||
|
||||
def _handle_error(self, response, action):
|
||||
def _handle_error(self, response: requests.Response, action: str) -> None:
|
||||
"""
|
||||
Handle errors from API responses.
|
||||
|
||||
|
@ -292,8 +300,19 @@ class FirecrawlApp:
|
|||
Raises:
|
||||
Exception: An exception with a message containing the status code and error details from the response.
|
||||
"""
|
||||
if response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
|
||||
error_message = response.json().get('error', 'No additional error details provided.')
|
||||
|
||||
if response.status_code == 402:
|
||||
message = f"Payment Required: Failed to {action}. {error_message}"
|
||||
elif response.status_code == 408:
|
||||
message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
|
||||
elif response.status_code == 409:
|
||||
message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
|
||||
elif response.status_code == 500:
|
||||
message = f"Internal Server Error: Failed to {action}. {error_message}"
|
||||
else:
|
||||
raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
|
||||
message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
|
||||
|
||||
# Raise an HTTPError with the custom message and attach the response
|
||||
raise requests.exceptions.HTTPError(message, response=response)
|
||||
|
Loading…
Reference in New Issue
Block a user