mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
wip
This commit is contained in:
parent
6cdf4c68ec
commit
d599d31e63
|
@ -1,4 +1,4 @@
|
|||
import FirecrawlApp from '../../index';
|
||||
import FirecrawlApp, { CrawlResponseV0, FirecrawlDocumentV0, JobStatusResponseV0, ScrapeResponseV0, SearchResponseV0 } from '../../index';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import dotenv from 'dotenv';
|
||||
import { describe, test, expect } from '@jest/globals';
|
||||
|
@ -11,31 +11,31 @@ const API_URL = "http://127.0.0.1:3002";
|
|||
describe('FirecrawlApp E2E Tests', () => {
|
||||
test.concurrent('should throw error for no API key', async () => {
|
||||
expect(() => {
|
||||
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
|
||||
new FirecrawlApp({ apiKey: null, apiUrl: API_URL, version: "v0" });
|
||||
}).toThrow("No API key provided");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for invalid API key on scrape', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
|
||||
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai');
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai');
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data).toHaveProperty('markdown');
|
||||
|
@ -44,8 +44,8 @@ describe('FirecrawlApp E2E Tests', () => {
|
|||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response with valid API key and include HTML', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data?.markdown).toContain("_Roast_");
|
||||
|
@ -53,41 +53,41 @@ describe('FirecrawlApp E2E Tests', () => {
|
|||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on crawl', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
|
||||
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as CrawlResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response[0].content).toContain("_Roast_");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should handle idempotency key for crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
|
@ -95,12 +95,12 @@ describe('FirecrawlApp E2E Tests', () => {
|
|||
});
|
||||
|
||||
test.concurrent('should check crawl status', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false);
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
let statusResponse = await app.checkCrawlStatus(response.jobId);
|
||||
let statusResponse: any = await app.checkCrawlStatus(response.jobId);
|
||||
const maxChecks = 15;
|
||||
let checks = 0;
|
||||
|
||||
|
@ -108,7 +108,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
expect(statusResponse.partial_data).not.toBeNull();
|
||||
expect(statusResponse.current).toBeGreaterThanOrEqual(1);
|
||||
statusResponse = await app.checkCrawlStatus(response.jobId);
|
||||
statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponseV0;
|
||||
checks++;
|
||||
}
|
||||
|
||||
|
@ -121,20 +121,20 @@ describe('FirecrawlApp E2E Tests', () => {
|
|||
}, 35000); // 35 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for search', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.search("test query");
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.search("test query") as SearchResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.data?.[0]?.content).toBeDefined();
|
||||
expect(response?.data?.length).toBeGreaterThan(2);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on search', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL, version: "v0" });
|
||||
await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should perform LLM extraction', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL, version: "v0" });
|
||||
const response = await app.scrapeUrl("https://mendable.ai", {
|
||||
extractorOptions: {
|
||||
mode: 'llm-extraction',
|
||||
|
@ -149,7 +149,7 @@ describe('FirecrawlApp E2E Tests', () => {
|
|||
required: ['company_mission', 'supports_sso', 'is_open_source']
|
||||
}
|
||||
}
|
||||
});
|
||||
}) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.llm_extraction).toBeDefined();
|
||||
const llmExtraction = response.data?.llm_extraction;
|
||||
|
|
|
@ -0,0 +1,122 @@
|
|||
import FirecrawlApp, { CrawlResponse, JobStatusResponse, ScrapeResponse } from '../../../index';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import dotenv from 'dotenv';
|
||||
import { describe, test, expect } from '@jest/globals';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const TEST_API_KEY = process.env.TEST_API_KEY;
|
||||
const API_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe('FirecrawlApp E2E Tests', () => {
|
||||
test.concurrent('should throw error for no API key', async () => {
|
||||
expect(() => {
|
||||
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
|
||||
}).toThrow("No API key provided");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for invalid API key on scrape', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data).toHaveProperty('markdown');
|
||||
expect(response.data).toHaveProperty('metadata');
|
||||
expect(response.data).not.toHaveProperty('html');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response with valid API key and include HTML', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } }) as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data?.markdown).toContain("_Roast_");
|
||||
expect(response.data?.html).toContain("<h1");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on crawl', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30) as JobStatusResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.[0].content).toContain("_Roast_");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should handle idempotency key for crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey) as CrawlResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
await expect(app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
||||
});
|
||||
|
||||
test.concurrent('should check crawl status', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response: any = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false) as JobStatusResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
let statusResponse: any = await app.checkCrawlStatus(response.jobId);
|
||||
const maxChecks = 15;
|
||||
let checks = 0;
|
||||
|
||||
while (statusResponse.status === 'active' && checks < maxChecks) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
expect(statusResponse.partial_data).not.toBeNull();
|
||||
expect(statusResponse.current).toBeGreaterThanOrEqual(1);
|
||||
statusResponse = await app.checkCrawlStatus(response.jobId) as CrawlResponse;
|
||||
checks++;
|
||||
}
|
||||
|
||||
expect(statusResponse).not.toBeNull();
|
||||
expect(statusResponse.success).toBe(true);
|
||||
expect(statusResponse.status).toBe('completed');
|
||||
expect(statusResponse.total).toEqual(statusResponse.current);
|
||||
expect(statusResponse.current_step).not.toBeNull();
|
||||
expect(statusResponse?.data?.length).toBeGreaterThan(0);
|
||||
}, 35000); // 35 seconds timeout
|
||||
});
|
Loading…
Reference in New Issue
Block a user