Merge remote-tracking branch 'origin/main' into pr/765

This commit is contained in:
rafaelsideguide 2024-10-29 11:02:23 -03:00
commit d301c1bf0f
68 changed files with 4774 additions and 272 deletions


@ -1,20 +0,0 @@
name: Check Queues
on:
schedule:
- cron: '*/5 * * * *'
env:
BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}
jobs:
clean-jobs:
runs-on: ubuntu-latest
steps:
- name: Send GET request to check queues
run: |
response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/check-queues)
if [ "$response" -ne 200 ]; then
echo "Failed to check queues. Response: $response"
exit 1
fi
echo "Successfully checked queues. Response: $response"

.gitignore vendored

@ -28,3 +28,5 @@ apps/js-sdk/firecrawl/dist
/examples/o1_web_crawler/firecrawl_env
/examples/crm_lead_enrichment/crm_lead_enrichment_env
/.venv
/examples/claude_web_crawler/firecrawl_env


@ -1,4 +1,5 @@
<h3 align="center">
<a name="readme-top"></a>
<img
src="https://raw.githubusercontent.com/mendableai/firecrawl/main/img/firecrawl_logo.png"
height="200"
@ -79,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Actions**: click, scroll, input, wait and more before extracting data
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
@ -349,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
}'
```
### Batch Scraping Multiple URLs (New)
You can now batch scrape multiple URLs at the same time. It works much like the /crawl endpoint: it submits a batch scrape job and returns a job ID that you can use to check the status of the batch scrape.
```bash
curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer YOUR_API_KEY' \
-d '{
"urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
"formats" : ["markdown", "html"]
}'
```
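The returned job ID can then be polled for completion. Below is a minimal TypeScript sketch, not an official SDK snippet: it assumes Node 18+ (global `fetch`), a `FIRECRAWL_API_KEY` environment variable, and that the batch status response carries a `status` field and a `data` array like the crawl status endpoint does.

```ts
// Minimal sketch: poll a batch scrape job until it finishes.
const API_KEY = process.env.FIRECRAWL_API_KEY ?? "YOUR_API_KEY";

async function pollBatchScrape(jobId: string): Promise<any> {
  while (true) {
    const res = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
      headers: { Authorization: `Bearer ${API_KEY}` },
    });
    const body = await res.json();
    if (body.status === "completed" || body.status === "cancelled") {
      return body; // body.data holds the scraped documents when completed
    }
    await new Promise((r) => setTimeout(r, 2000)); // wait a bit before polling again
  }
}
```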
### Search (v0) (Beta)
@ -482,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
scrapeOptions: {
formats: ['markdown', 'html'],
}
} as CrawlParams, true, 30) as CrawlStatusResponse;
} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;
if (crawlResponse) {
console.log(crawlResponse)
@ -541,6 +556,12 @@ We love contributions! Please read our [contributing guide](CONTRIBUTING.md) bef
_It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions._
## Contributors
<a href="https://github.com/mendableai/firecrawl/graphs/contributors">
<img alt="contributors" src="https://contrib.rocks/image?repo=mendableai/firecrawl"/>
</a>
## License Disclaimer
This project is primarily licensed under the GNU Affero General Public License v3.0 (AGPL-3.0), as specified in the LICENSE file in the root directory of this repository. However, certain components of this project are licensed under the MIT License. Refer to the LICENSE files in these specific directories for details.
@ -552,3 +573,10 @@ Please note:
- When using or contributing to this project, ensure you comply with the appropriate license terms for the specific component you are working with.
For more details on the licensing of specific components, please refer to the LICENSE files in the respective directories or contact the project maintainers.
<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
<a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
↑ Back to Top ↑
</a>
</p>


@ -36,7 +36,7 @@ Self-hosting Firecrawl is ideal for those who need full control over their scrap
Create an `.env` in the root directory; you can copy over the template in `apps/api/.env.example`
To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features)
To start, we won't set up authentication or any optional subservices (pdf parsing, JS blocking support, AI features)
`.env:`
```
@ -47,7 +47,7 @@ HOST=0.0.0.0
REDIS_URL=redis://redis:6379
REDIS_RATE_LIMIT_URL=redis://redis:6379
## To turn on DB authentication, you need to set up supabase.
## To turn on DB authentication, you need to set up Supabase.
USE_DB_AUTHENTICATION=false
# ===== Optional ENVS ======
@ -59,8 +59,8 @@ SUPABASE_SERVICE_TOKEN=
# Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
@ -176,4 +176,4 @@ By addressing these common issues, you can ensure a smoother setup and operation
## Install Firecrawl on a Kubernetes Cluster (Simple Version)
Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.


@ -1,5 +1,5 @@
# ===== Required ENVS ======
NUM_WORKERS_PER_QUEUE=8
NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true
# ===== Optional ENVS ======
# SearchApi key. Head to https://searchapi.com/ to get your API key
SEARCHAPI_API_KEY=
# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
SEARCHAPI_ENGINE=
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
# Other Optionals


@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
BULL_AUTH_KEY=
LOGTAIL_KEY=
PLAYWRIGHT_MICROSERVICE_URL=
SEARCHAPI_API_KEY=


@ -121,6 +121,49 @@ describe("E2E Tests for v1 API Routes", () => {
},
30000
); // 30 seconds timeout
it.concurrent(
"should return a successful response with a valid API key",
async () => {
const scrapeRequest: ScrapeRequest = {
url: "https://arxiv.org/abs/2410.04840",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(scrapeRequest);
expect(response.statusCode).toBe(200);
if (!("data" in response.body)) {
throw new Error("Expected response body to have 'data' property");
}
expect(response.body.data).not.toHaveProperty("content");
expect(response.body.data).toHaveProperty("markdown");
expect(response.body.data).toHaveProperty("metadata");
expect(response.body.data).not.toHaveProperty("html");
expect(response.body.data.markdown).toContain("Strong Model Collapse");
expect(response.body.data.metadata.error).toBeUndefined();
expect(response.body.data.metadata.description).toContain("Abstract page for arXiv paper 2410.04840: Strong Model Collapse");
expect(response.body.data.metadata.citation_title).toBe("Strong Model Collapse");
expect(response.body.data.metadata.citation_author).toEqual([
"Dohmatob, Elvis",
"Feng, Yunzhen",
"Subramonian, Arjun",
"Kempe, Julia"
]);
expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
expect(response.body.data.metadata.citation_online_date).toBe("2024/10/08");
expect(response.body.data.metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840");
expect(response.body.data.metadata.citation_arxiv_id).toBe("2410.04840");
expect(response.body.data.metadata.citation_abstract).toContain("Within the scaling laws paradigm");
expect(response.body.data.metadata.sourceURL).toBe("https://arxiv.org/abs/2410.04840");
expect(response.body.data.metadata.statusCode).toBe(200);
},
30000
);
it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true",
async () => {


@ -13,7 +13,7 @@ import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger";
import { redlock } from "../services/redlock";
import { getValue } from "../services/redis";
import { deleteKey, getValue } from "../services/redis";
import { setValue } from "../services/redis";
import { validate } from "uuid";
import * as Sentry from "@sentry/node";
@ -37,12 +37,17 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
return validate(potentialUuid);
}
export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)) {
export async function setCachedACUC(
api_key: string,
acuc:
| AuthCreditUsageChunk
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)
) {
const cacheKeyACUC = `acuc_${api_key}`;
const redLockKey = `lock_${cacheKeyACUC}`;
try {
await redlock.using([redLockKey], 10000, {}, async signal => {
await redlock.using([redLockKey], 10000, {}, async (signal) => {
if (typeof acuc === "function") {
acuc = acuc(JSON.parse(await getValue(cacheKeyACUC)));
@ -68,31 +73,60 @@ export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk
}
}
export async function getACUC(api_key: string, cacheOnly = false): Promise<AuthCreditUsageChunk | null> {
export async function getACUC(
api_key: string,
cacheOnly = false,
useCache = true
): Promise<AuthCreditUsageChunk | null> {
const cacheKeyACUC = `acuc_${api_key}`;
const cachedACUC = await getValue(cacheKeyACUC);
if (useCache) {
const cachedACUC = await getValue(cacheKeyACUC);
if (cachedACUC !== null) {
return JSON.parse(cachedACUC);
}
}
if (cachedACUC !== null) {
return JSON.parse(cachedACUC);
} else if (!cacheOnly) {
const { data, error } =
await supabase_service.rpc("auth_credit_usage_chunk", { input_key: api_key });
if (error) {
throw new Error("Failed to retrieve authentication and credit usage data: " + JSON.stringify(error));
if (!cacheOnly) {
let data;
let error;
let retries = 0;
const maxRetries = 5;
while (retries < maxRetries) {
({ data, error } = await supabase_service.rpc(
"auth_credit_usage_chunk_test_21_credit_pack",
{ input_key: api_key }
));
if (!error) {
break;
}
Logger.warn(
`Failed to retrieve authentication and credit usage data (attempt ${retries + 1} of ${maxRetries}), trying again...`
);
retries++;
if (retries === maxRetries) {
throw new Error(
"Failed to retrieve authentication and credit usage data after 3 attempts: " +
JSON.stringify(error)
);
}
// Wait for a short time before retrying
await new Promise((resolve) => setTimeout(resolve, 200));
}
const chunk: AuthCreditUsageChunk | null = data.length === 0
? null
: data[0].team_id === null
? null
: data[0];
const chunk: AuthCreditUsageChunk | null =
data.length === 0 ? null : data[0].team_id === null ? null : data[0];
// NOTE: Should we cache null chunks? - mogery
if (chunk !== null) {
if (chunk !== null && useCache) {
setCachedACUC(api_key, chunk);
}
// console.log(chunk);
return chunk;
} else {
@ -100,6 +134,13 @@ export async function getACUC(api_key: string, cacheOnly = false): Promise<AuthC
}
}
export async function clearACUC(
api_key: string,
): Promise<void> {
const cacheKeyACUC = `acuc_${api_key}`;
await deleteKey(cacheKeyACUC);
}
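The inline retry loop in `getACUC` above follows a simple fixed-delay pattern. For illustration only, here is a generic sketch of that pattern; the helper name and signature are assumptions and not part of this codebase, and unlike `getACUC` it retries on thrown errors rather than on a returned `error` value.

```ts
// Illustrative only: a fixed-delay retry wrapper similar in spirit to the loop in getACUC.
async function withRetries<T>(fn: () => Promise<T>, maxRetries = 5, delayMs = 200): Promise<T> {
  let lastError: unknown;
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      return await fn(); // return on the first attempt that does not throw
    } catch (error) {
      lastError = error;
      if (attempt < maxRetries) {
        await new Promise((resolve) => setTimeout(resolve, delayMs)); // short pause before retrying
      }
    }
  }
  throw lastError;
}
```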
export async function authenticateUser(
req,
res,
@ -132,7 +173,11 @@ export async function supaAuthenticateUser(
plan?: PlanType;
chunk?: AuthCreditUsageChunk;
}> {
const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? `Bearer ${req.headers["sec-websocket-protocol"]}` : null);
const authHeader =
req.headers.authorization ??
(req.headers["sec-websocket-protocol"]
? `Bearer ${req.headers["sec-websocket-protocol"]}`
: null);
if (!authHeader) {
return { success: false, error: "Unauthorized", status: 401 };
}
@ -162,7 +207,7 @@ export async function supaAuthenticateUser(
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
} else {
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
}
}
teamId = "preview";
} else {
normalizedApi = parseApi(token);


@ -0,0 +1,22 @@
import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth";
import { Logger } from "../../../lib/logger";
export async function acucCacheClearController(req: Request, res: Response) {
try {
const team_id: string = req.body.team_id;
const keys = await supabase_service
.from("api_keys")
.select("*")
.eq("team_id", team_id);
await Promise.all(keys.data.map((x) => clearACUC(x.key)));
res.json({ ok: true });
} catch (error) {
Logger.error(`Error clearing ACUC cache via API route: ${error}`);
res.status(500).json({ error: "Internal server error" });
}
}
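For reference, a hedged sketch of how this controller could be invoked once it is mounted under the admin router (see the routes change later in this diff). The host is an assumption based on the default `PORT=3002` in `.env.example`, and `BULL_AUTH_KEY` must match the server's environment.

```ts
// Illustrative call to the ACUC cache-clear admin route added in this diff.
const BULL_AUTH_KEY = process.env.BULL_AUTH_KEY ?? "replace-me";

async function clearTeamAcucCache(teamId: string): Promise<void> {
  const res = await fetch(`http://localhost:3002/admin/${BULL_AUTH_KEY}/acuc-cache-clear`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ team_id: teamId }), // the controller reads req.body.team_id
  });
  if (!res.ok) throw new Error(`Cache clear failed with status ${res.status}`);
}
```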


@ -60,7 +60,7 @@ export async function crawlStatusController(req: Request, res: Response) {
}));
// Filter out failed jobs
jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed");
jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed" && x.status !== "unknown");
// Sort jobs by timestamp
jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp);


@ -0,0 +1,103 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
BatchScrapeRequest,
batchScrapeRequestSchema,
CrawlResponse,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJobs,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
export async function batchScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
res: Response<CrawlResponse>
) {
req.body = batchScrapeRequestSchema.parse(req.body);
const id = uuidv4();
await logCrawl(id, req.auth.team_id);
let { remainingCredits } = req.account;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){
remainingCredits = Infinity;
}
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const sc: StoredCrawl = {
crawlerOptions: null,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
};
await saveCrawl(id, sc);
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(req.body.urls.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
}
const jobs = req.body.urls.map((x) => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url: x,
mode: "single_urls",
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions: null,
pageOptions,
extractorOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
},
opts: {
jobId: uuid,
priority: jobPriority,
},
};
});
await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);
const protocol = process.env.ENV === "local" ? req.protocol : "https";
return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
});
}


@ -97,12 +97,23 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
let jobIDs = await getCrawlJobs(req.params.jobId);
let jobStatuses = await Promise.all(jobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)] as const));
const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed
// filter out failed jobs
jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed"));
// filter the job statues
jobStatuses = jobStatuses.filter(x => x[1] !== "failed");
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
const throttledJobsSet = new Set(throttledJobs);
const validJobStatuses = [];
const validJobIDs = [];
for (const [id, status] of jobStatuses) {
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
validJobStatuses.push([id, status]);
validJobIDs.push(id);
}
}
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
jobIDs = validJobIDs; // Use validJobIDs instead of jobIDs for further processing
const doneJobs = await getJobs(doneJobIDs);
const data = doneJobs.map(x => x.returnvalue);


@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
return jobs;
}
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
@ -60,12 +60,24 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
let jobIDs = await getCrawlJobs(req.params.jobId);
let jobStatuses = await Promise.all(jobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)] as const));
const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed
// filter out failed jobs
jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed"));
// filter the job statues
jobStatuses = jobStatuses.filter(x => x[1] !== "failed");
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
const throttledJobsSet = new Set(throttledJobs);
const validJobStatuses = [];
const validJobIDs = [];
for (const [id, status] of jobStatuses) {
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
validJobStatuses.push([id, status]);
validJobIDs.push(id);
}
}
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
// Use validJobIDs instead of jobIDs for further processing
jobIDs = validJobIDs;
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
@ -100,7 +112,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const data = doneJobs.map(x => x.returnvalue);
const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
nextURL.searchParams.set("skip", (start + data.length).toString());


@ -78,7 +78,7 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
} catch (e) {
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(


@ -63,7 +63,7 @@ export async function mapController(
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = await redis.get(cacheKey);
const cachedResult = null;
let allResults: any[];
let pagePromises: Promise<any>[];


@ -139,7 +139,7 @@ export async function scrapeController(
crawlerOptions: {},
pageOptions: pageOptions,
origin: origin,
extractor_options: { mode: "markdown" },
extractor_options: extractorOptions,
num_tokens: numTokens,
});


@ -4,6 +4,7 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types";
import { countries } from "../../lib/validate-country";
export type Format =
| "markdown"
@ -108,6 +109,28 @@ export const scrapeOptions = z.object({
extract: extractOptions.optional(),
parsePDF: z.boolean().default(true),
actions: actionsSchema.optional(),
// New
location: z.object({
country: z.string().optional().refine(
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
{
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
}
).transform(val => val ? val.toUpperCase() : 'US'),
languages: z.string().array().optional(),
}).optional(),
// Deprecated
geolocation: z.object({
country: z.string().optional().refine(
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
{
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
}
).transform(val => val ? val.toUpperCase() : 'US'),
languages: z.string().array().optional(),
}).optional(),
skipTlsVerification: z.boolean().default(false),
}).strict(strictMessage)
@ -132,19 +155,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
return obj;
});
// export type ScrapeRequest = {
// url: string;
// formats?: Format[];
// headers?: { [K: string]: string };
// includeTags?: string[];
// excludeTags?: string[];
// onlyMainContent?: boolean;
// timeout?: number;
// waitFor?: number;
// }
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
}).strict(strictMessage).refine(
(obj) => {
const hasExtractFormat = obj.formats?.includes("extract");
const hasExtractOptions = obj.extract !== undefined;
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
},
{
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
}
).transform((obj) => {
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
return { ...obj, timeout: 60000 };
}
return obj;
});
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
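To make the refine rule above concrete, here is a hedged sketch of a body that `batchScrapeRequestSchema` should accept: the `extract` format is paired with `extract` options, and the transform defaults `timeout` to 60000 ms because none is given. The import mirrors the one used by the batch-scrape controller; the shape of the inner `extract` object is an assumption, since the extract options schema is not shown in this hunk.

```ts
import { batchScrapeRequestSchema } from "./types"; // same relative import the controller uses

// Illustrative body: "extract" format paired with extract options, as the refine requires.
const parsed = batchScrapeRequestSchema.parse({
  urls: ["https://docs.firecrawl.dev", "https://firecrawl.dev"],
  formats: ["markdown", "extract"],
  extract: { prompt: "List the product names mentioned on the page." }, // assumed option shape
});

console.log(parsed.timeout); // 60000 — set by the transform because extract is used and no timeout was given
```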
const crawlerOptions = z.object({
includePaths: z.string().array().default([]),
excludePaths: z.string().array().default([]),
@ -250,6 +283,8 @@ export type Document = {
sourceURL?: string;
statusCode?: number;
error?: string;
[key: string]: string | string[] | number | undefined;
};
};
@ -340,6 +375,8 @@ export type AuthCreditUsageChunk = {
coupons: any[];
adjusted_credits_used: number; // credits this period minus coupons used
remaining_credits: number;
sub_user_id: string | null;
total_credits_sum: number;
};
export interface RequestWithMaybeACUC<
@ -421,6 +458,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.location ?? x.geolocation,
skipTlsVerification: x.skipTlsVerification
};
}
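A hedged request sketch using the new `location` and `skipTlsVerification` options defined above; `legacyScrapeOptions` maps `location` onto the internal `geolocation` page option. The field values are illustrative, and Node 18+ (global `fetch`) plus a `FIRECRAWL_API_KEY` environment variable are assumed.

```ts
// Illustrative v1 scrape request using the new location and skipTlsVerification options.
async function scrapeWithLocation() {
  const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify({
      url: "https://example.com",
      formats: ["markdown"],
      location: { country: "DE", languages: ["de"] }, // ISO 3166-1 alpha-2; defaults to "US" when omitted
      skipTlsVerification: true, // skip TLS certificate verification while scraping
    }),
  });
  return res.json();
}
```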


@ -20,6 +20,7 @@ import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid";
import dns from 'node:dns';
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -28,13 +29,13 @@ const { ExpressAdapter } = require("@bull-board/express");
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
Logger.info(`Number of CPUs: ${numCPUs} available`);
const cacheable = new CacheableLookup({
// this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme
lookup:false
});
const cacheable = new CacheableLookup()
// Install cacheable lookup for all other requests
cacheable.install(http.globalAgent);
cacheable.install(https.globalAgent)
cacheable.install(https.globalAgent);
const ws = expressWs(express());
const app = ws.app;


@ -6,7 +6,13 @@ export function numTokensFromString(message: string, model: string): number {
const encoder = encoding_for_model(model as TiktokenModel);
// Encode the message into tokens
const tokens = encoder.encode(message);
let tokens: Uint32Array;
try {
tokens = encoder.encode(message);
} catch (error) {
message = message.replace("<|endoftext|>", "");
tokens = encoder.encode(message);
}
// Free the encoder resources after use
encoder.free();


@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
import { Logger } from "./logger";
export type StoredCrawl = {
originUrl: string;
originUrl?: string;
crawlerOptions: any;
pageOptions: any;
team_id: string;


@ -51,6 +51,10 @@ export type PageOptions = {
disableJsDom?: boolean; // beta
atsv?: boolean; // anti-bot solver, beta
actions?: Action[]; // beta
geolocation?: {
country?: string;
};
skipTlsVerification?: boolean;
};
export type ExtractorOptions = {

File diff suppressed because it is too large


@ -112,7 +112,7 @@ export async function runWebScraper({
}
// remove docs with empty content
const filteredDocs = crawlerOptions.returnOnlyUrls
const filteredDocs = crawlerOptions?.returnOnlyUrls
? docs.map((doc) => {
if (doc.metadata.sourceURL) {
return { url: doc.metadata.sourceURL };
@ -121,8 +121,13 @@ export async function runWebScraper({
: docs;
if(is_scrape === false) {
billTeam(team_id, undefined, filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
let creditsToBeBilled = 1; // Assuming 1 credit per document
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
creditsToBeBilled = 5;
}
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}


@ -6,6 +6,8 @@ import {
cleanBefore24hCompleteJobsController,
queuesController,
} from "../controllers/v0/admin/queue";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { wrap } from "./v1";
export const adminRouter = express.Router();
@ -33,3 +35,8 @@ adminRouter.get(
`/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
autoscalerController
);
adminRouter.post(
`/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
wrap(acucCacheClearController),
);


@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@ -29,14 +30,14 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
return (req, res, next) => {
(async () => {
if (!minimum && req.body) {
minimum = (req.body as any)?.limit ?? 1;
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
}
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
req.acuc = chunk;
if (!success) {
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
if (!res.headersSent) {
return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." });
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
}
}
req.account = { remainingCredits };
@ -94,7 +95,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
next();
}
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
export function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
return (req, res, next) => {
controller(req, res)
.catch(err => next(err))
@ -122,6 +123,15 @@ v1Router.post(
wrap(crawlController)
);
v1Router.post(
"/batch/scrape",
authMiddleware(RateLimiterMode.Crawl),
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(batchScrapeController)
);
v1Router.post(
"/map",
authMiddleware(RateLimiterMode.Map),
@ -136,6 +146,13 @@ v1Router.get(
wrap(crawlStatusController)
);
v1Router.get(
"/batch/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
// Yes, it uses the same controller as the normal crawl status controller
wrap((req:any, res):any => crawlStatusController(req, res, true))
);
v1Router.get(
"/scrape/:jobId",
wrap(scrapeStatusController)


@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger";
import https from "https";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
@ -136,13 +136,23 @@ export class WebCrawler {
return false;
}
if (this.isFile(link)) {
return false;
}
return true;
})
.slice(0, limit);
}
public async getRobotsTxt(): Promise<string> {
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
let extraArgs = {};
if(skipTlsVerification) {
extraArgs["httpsAgent"] = new https.Agent({
rejectUnauthorized: false
});
}
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
return response.data;
}
@ -478,7 +488,14 @@ export class WebCrawler {
".webp",
".inc"
];
return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext));
try {
const urlWithoutQuery = url.split('?')[0].toLowerCase();
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
} catch (error) {
Logger.error(`Error processing URL in isFile: ${error}`);
return false;
}
}
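The query-string handling above means file detection now ignores URL parameters. A small standalone sketch mirroring that logic (it does not call the private method, and the extension list is abbreviated for illustration):

```ts
// Mirrors the isFile change: strip the query string before checking the extension.
const fileExtensions = [".png", ".pdf", ".webp"]; // abbreviated list for illustration

function looksLikeFile(url: string): boolean {
  const urlWithoutQuery = url.split("?")[0].toLowerCase();
  return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
}

console.log(looksLikeFile("https://example.com/photo.webp?width=800")); // true
console.log(looksLikeFile("https://example.com/page?download=report.pdf")); // false — the extension is only in the query
```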
private isSocialMediaOrEmail(url: string): boolean {


@ -593,6 +593,8 @@ export class WebScraperDataProvider {
disableJsDom: options.pageOptions?.disableJsDom ?? false,
atsv: options.pageOptions?.atsv ?? false,
actions: options.pageOptions?.actions ?? undefined,
geolocation: options.pageOptions?.geolocation ?? undefined,
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
};
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
this.replaceAllPathsWithAbsolutePaths =


@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0,
screenshot = false,
fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
fireEngineOptions = {},
headers,
options,
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
waitFor?: number;
screenshot?: boolean;
fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
@ -118,6 +118,8 @@ export async function scrapWithFireEngine({
...fireEngineOptionsParam,
atsv: pageOptions?.atsv ?? false,
scrollXPaths: pageOptions?.scrollXPaths ?? [],
geolocation: pageOptions?.geolocation,
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
actions: actions,
},
{


@ -156,6 +156,8 @@ export async function scrapSingleUrl(
disableJsDom: pageOptions.disableJsDom ?? false,
atsv: pageOptions.atsv ?? false,
actions: pageOptions.actions ?? undefined,
geolocation: pageOptions.geolocation ?? undefined,
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
}
if (extractorOptions) {
@ -207,14 +209,15 @@ export async function scrapSingleUrl(
if (action.type === "click" || action.type === "write" || action.type === "press") {
const result: Action[] = [];
// Don't add a wait if the previous action is a wait
if (index === 0 || array[index - 1].type !== "wait") {
result.push({ type: "wait", milliseconds: 1200 } as Action);
}
// if (index === 0 || array[index - 1].type !== "wait") {
// result.push({ type: "wait", milliseconds: 1200 } as Action);
// }
// Fire-engine now handles wait times automatically, leaving the code here for now
result.push(action);
// Don't add a wait if the next action is a wait
if (index === array.length - 1 || array[index + 1].type !== "wait") {
result.push({ type: "wait", milliseconds: 1200 } as Action);
}
// if (index === array.length - 1 || array[index + 1].type !== "wait") {
// result.push({ type: "wait", milliseconds: 1200 } as Action);
// }
return result;
}
return [action as Action];


@ -3,10 +3,8 @@ export const excludeNonMainTags = [
"footer",
"nav",
"aside",
".header",
".top",
".navbar",
"#header",
".footer",
".bottom",
"#footer",
@ -39,8 +37,6 @@ export const excludeNonMainTags = [
"#search",
".share",
"#share",
".widget",
"#widget",
".cookie",
"#cookie"
];


@ -34,6 +34,7 @@ interface Metadata {
sourceURL?: string;
pageStatusCode?: number;
pageError?: string;
[key: string]: string | string[] | number | undefined;
}
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
@ -70,40 +71,78 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
let pageStatusCode: number | null = null;
let pageError: string | null = null;
const customMetadata: Record<string, string | string[]> = {};
try {
// TODO: remove this as it is redundant with the below implementation
title = soup("title").text() || null;
description = soup('meta[name="description"]').attr("content") || null;
// Assuming the language is part of the URL as per the regex pattern
language = soup('html').attr('lang') || null;
language = soup("html").attr("lang") || null;
keywords = soup('meta[name="keywords"]').attr("content") || null;
robots = soup('meta[name="robots"]').attr("content") || null;
ogTitle = soup('meta[property="og:title"]').attr("content") || null;
ogDescription = soup('meta[property="og:description"]').attr("content") || null;
ogDescription =
soup('meta[property="og:description"]').attr("content") || null;
ogUrl = soup('meta[property="og:url"]').attr("content") || null;
ogImage = soup('meta[property="og:image"]').attr("content") || null;
ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null;
ogDeterminer =
soup('meta[property="og:determiner"]').attr("content") || null;
ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null;
ogLocaleAlternate =
soup('meta[property="og:locale:alternate"]')
.map((i, el) => soup(el).attr("content"))
.get() || null;
ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
ogVideo = soup('meta[property="og:video"]').attr("content") || null;
articleSection = soup('meta[name="article:section"]').attr("content") || null;
articleSection =
soup('meta[name="article:section"]').attr("content") || null;
articleTag = soup('meta[name="article:tag"]').attr("content") || null;
publishedTime = soup('meta[property="article:published_time"]').attr("content") || null;
modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null;
dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null;
publishedTime =
soup('meta[property="article:published_time"]').attr("content") || null;
modifiedTime =
soup('meta[property="article:modified_time"]').attr("content") || null;
dctermsKeywords =
soup('meta[name="dcterms.keywords"]').attr("content") || null;
dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null;
dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null;
dctermsSubject =
soup('meta[name="dcterms.subject"]').attr("content") || null;
dctermsAudience =
soup('meta[name="dcterms.audience"]').attr("content") || null;
dcType = soup('meta[name="dc.type"]').attr("content") || null;
dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
dcDate = soup('meta[name="dc.date"]').attr("content") || null;
dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null;
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
dcDateCreated =
soup('meta[name="dc.date.created"]').attr("content") || null;
dctermsCreated =
soup('meta[name="dcterms.created"]').attr("content") || null;
try {
// Extract all meta tags for custom metadata
soup("meta").each((i, elem) => {
try {
const name = soup(elem).attr("name") || soup(elem).attr("property");
const content = soup(elem).attr("content");
if (name && content) {
if (customMetadata[name] === undefined) {
customMetadata[name] = content;
} else if (Array.isArray(customMetadata[name])) {
(customMetadata[name] as string[]).push(content);
} else {
customMetadata[name] = [customMetadata[name] as string, content];
}
}
} catch (error) {
Logger.error(`Error extracting custom metadata (in): ${error}`);
}
});
} catch (error) {
Logger.error(`Error extracting custom metadata: ${error}`);
}
} catch (error) {
Logger.error(`Error extracting metadata: ${error}`);
}
@ -141,5 +180,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
...(sourceURL ? { sourceURL } : {}),
...(pageStatusCode ? { pageStatusCode } : {}),
...(pageError ? { pageError } : {}),
...customMetadata,
};
}
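To illustrate the custom-metadata pass added above: every `<meta>` tag with a `name` or `property` and a `content` attribute is folded into the returned metadata, and repeated names collapse into arrays. A hedged sketch of the resulting shape, using arXiv-style tags like the ones the new e2e test earlier in this diff asserts on (values are illustrative):

```ts
// Given HTML head content like:
//   <meta name="citation_title" content="Strong Model Collapse">
//   <meta name="citation_author" content="Dohmatob, Elvis">
//   <meta name="citation_author" content="Feng, Yunzhen">
// the loop above would contribute roughly:
const customMetadata: Record<string, string | string[]> = {
  citation_title: "Strong Model Collapse",                 // a single occurrence stays a string
  citation_author: ["Dohmatob, Elvis", "Feng, Yunzhen"],   // a repeated name becomes an array
};
```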


@ -1,5 +1,5 @@
import axios, { AxiosResponse } from "axios";
import fs from "fs";
import fs from "fs/promises";
import { createReadStream, createWriteStream } from "node:fs";
import FormData from "form-data";
import dotenv from "dotenv";
@ -15,7 +15,7 @@ export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promis
try {
const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
const content = await processPdfToText(tempFilePath, parsePDF);
fs.unlinkSync(tempFilePath); // Clean up the temporary file
await fs.unlink(tempFilePath); // Clean up the temporary file
return { content, pageStatusCode, pageError };
} catch (error) {
Logger.error(`Failed to fetch and process PDF: ${error.message}`);
@ -120,7 +120,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
}
} else {
try {
content = fs.readFileSync(filePath, "utf-8");
content = await fs.readFile(filePath, "utf-8");
} catch (error) {
Logger.error(`Failed to read PDF file: ${error}`);
content = "";
@ -131,7 +131,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
async function processPdf(file: string) {
try {
const fileContent = fs.readFileSync(file);
const fileContent = await fs.readFile(file);
const data = await pdf(fileContent);
return data.text;
} catch (error) {


@ -6,6 +6,7 @@ import { Logger } from "../lib/logger";
dotenv.config();
export async function fireEngineMap(
q: string,
options: {
@ -41,11 +42,12 @@ export async function fireEngineMap(
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
headers: {
"Content-Type": "application/json",
"X-Disable-Cache": "true"
},
data: data,
};
const response = await axios(config);
if (response && response) {
if (response && response.data) {
return response.data;
} else {
return [];


@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { searchapi_search } from "./searchapi";
import { serper_search } from "./serper";
export async function search({
@ -30,7 +31,16 @@ export async function search({
timeout?: number;
}): Promise<SearchResult[]> {
try {
if (process.env.SEARCHAPI_API_KEY) {
return await searchapi_search(query, {
num_results,
tbs,
filter,
lang,
country,
location
});
}
if (process.env.SERPER_API_KEY) {
return await serper_search(query, {
num_results,


@ -0,0 +1,60 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";
dotenv.config();
interface SearchOptions {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
num_results: number;
page?: number;
}
export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
const params = {
q: q,
hl: options.lang,
gl: options.country,
location: options.location,
num: options.num_results,
page: options.page ?? 1,
engine: process.env.SEARCHAPI_ENGINE || "google",
};
const url = `https://www.searchapi.io/api/v1/search`;
try {
const response = await axios.get(url, {
headers: {
"Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
"Content-Type": "application/json",
"X-SearchApi-Source": "Firecrawl",
},
params: params,
});
if (response.status === 401) {
throw new Error("Unauthorized. Please check your API key.");
}
const data = response.data;
if (data && Array.isArray(data.organic_results)) {
return data.organic_results.map((a: any) => ({
url: a.link,
title: a.title,
description: a.snippet,
}));
} else {
return [];
}
} catch (error) {
console.error(`There was an error searching for content: ${error.message}`);
return [];
}
}
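A hedged usage sketch for the new SearchApi wrapper. It assumes `SEARCHAPI_API_KEY` (and optionally `SEARCHAPI_ENGINE`) are set, as wired up in the `.env.example` and `search/index.ts` changes in this diff; the relative import path matches the one used there.

```ts
import { searchapi_search } from "./searchapi";

// searchapi_search returns [] on errors or when no organic results come back.
async function demoSearch() {
  const results = await searchapi_search("firecrawl web scraping", {
    num_results: 5,
    lang: "en",
    country: "us",
  });
  for (const r of results) {
    console.log(r.title, r.url); // SearchResult fields mapped from organic_results
  }
}
```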


@ -0,0 +1,176 @@
// Import necessary dependencies and types
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
import { getACUC, setCachedACUC } from "../../controllers/auth";
import { redlock } from "../redlock";
import { supabase_service } from "../supabase";
import { createPaymentIntent } from "./stripe";
import { issueCredits } from "./issue_credits";
import { sendNotification } from "../notification/email_notification";
import { NotificationType } from "../../types";
import { deleteKey, getValue, setValue } from "../redis";
import { sendSlackWebhook } from "../alerts/slack";
import { Logger } from "../../lib/logger";
// Define the number of credits to be added during auto-recharge
const AUTO_RECHARGE_CREDITS = 1000;
const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds
/**
* Attempt to automatically charge a user's account when their credit balance falls below a threshold
* @param chunk The user's current usage data
* @param autoRechargeThreshold The credit threshold that triggers auto-recharge
*/
export async function autoCharge(
chunk: AuthCreditUsageChunk,
autoRechargeThreshold: number
): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> {
const resource = `auto-recharge:${chunk.team_id}`;
const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`;
try {
// Check if the team is in the cooldown period
// Another check to prevent race conditions, double charging - cool down of 5 minutes
const cooldownValue = await getValue(cooldownKey);
if (cooldownValue) {
Logger.info(`Auto-recharge for team ${chunk.team_id} is in cooldown period`);
return {
success: false,
message: "Auto-recharge is in cooldown period",
remainingCredits: chunk.remaining_credits,
chunk,
};
}
// Use a distributed lock to prevent concurrent auto-charge attempts
return await redlock.using([resource], 5000, async (signal) : Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> => {
// Recheck the condition inside the lock to prevent race conditions
const updatedChunk = await getACUC(chunk.api_key, false, false);
if (
updatedChunk &&
updatedChunk.remaining_credits < autoRechargeThreshold
) {
if (chunk.sub_user_id) {
// Fetch the customer's Stripe information
const { data: customer, error: customersError } =
await supabase_service
.from("customers")
.select("id, stripe_customer_id")
.eq("id", chunk.sub_user_id)
.single();
if (customersError) {
Logger.error(`Error fetching customer data: ${customersError}`);
return {
success: false,
message: "Error fetching customer data",
remainingCredits: chunk.remaining_credits,
chunk,
};
}
if (customer && customer.stripe_customer_id) {
let issueCreditsSuccess = false;
// Attempt to create a payment intent
const paymentStatus = await createPaymentIntent(
chunk.team_id,
customer.stripe_customer_id
);
// If payment is successful or requires further action, issue credits
if (
paymentStatus.return_status === "succeeded" ||
paymentStatus.return_status === "requires_action"
) {
issueCreditsSuccess = await issueCredits(
chunk.team_id,
AUTO_RECHARGE_CREDITS
);
}
// Record the auto-recharge transaction
await supabase_service.from("auto_recharge_transactions").insert({
team_id: chunk.team_id,
initial_payment_status: paymentStatus.return_status,
credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0,
stripe_charge_id: paymentStatus.charge_id,
});
// Send a notification if credits were successfully issued
if (issueCreditsSuccess) {
await sendNotification(
chunk.team_id,
NotificationType.AUTO_RECHARGE_SUCCESS,
chunk.sub_current_period_start,
chunk.sub_current_period_end,
chunk,
true
);
// Set cooldown period
await setValue(cooldownKey, 'true', AUTO_RECHARGE_COOLDOWN);
}
// Reset ACUC cache to reflect the new credit balance
const cacheKeyACUC = `acuc_${chunk.api_key}`;
await deleteKey(cacheKeyACUC);
if (process.env.SLACK_ADMIN_WEBHOOK_URL) {
const webhookCooldownKey = `webhook_cooldown_${chunk.team_id}`;
const isInCooldown = await getValue(webhookCooldownKey);
if (!isInCooldown) {
sendSlackWebhook(
`Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`,
false,
process.env.SLACK_ADMIN_WEBHOOK_URL
).catch((error) => {
Logger.debug(`Error sending slack notification: ${error}`);
});
// Set cooldown for 1 hour
await setValue(webhookCooldownKey, 'true', 60 * 60);
}
}
return {
success: true,
message: "Auto-recharge successful",
remainingCredits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS,
chunk: {...chunk, remaining_credits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS},
};
} else {
Logger.error("No Stripe customer ID found for user");
return {
success: false,
message: "No Stripe customer ID found for user",
remainingCredits: chunk.remaining_credits,
chunk,
};
}
} else {
Logger.error("No sub_user_id found in chunk");
return {
success: false,
message: "No sub_user_id found in chunk",
remainingCredits: chunk.remaining_credits,
chunk,
};
}
}
return {
success: false,
message: "No need to auto-recharge",
remainingCredits: chunk.remaining_credits,
chunk,
};
});
} catch (error) {
Logger.error(`Failed to acquire lock for auto-recharge: ${error}`);
return {
success: false,
message: "Failed to acquire lock for auto-recharge",
remainingCredits: chunk.remaining_credits,
chunk,
};
}
}
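The cooldown-plus-lock structure above is the core double-charge guard. A minimal sketch of just the Redis cooldown part, for illustration: it reuses the `getValue`/`setValue` helpers imported at the top of this file, but the wrapper itself is an assumption, and the real implementation additionally wraps the work in a redlock.

```ts
import { getValue, setValue } from "../redis"; // same helpers the auto-charge code uses

const AUTO_RECHARGE_COOLDOWN = 300; // seconds

// Illustrative guard: skip the action entirely while the cooldown key exists.
async function withCooldown(teamId: string, action: () => Promise<void>): Promise<boolean> {
  const cooldownKey = `auto-recharge-cooldown:${teamId}`;
  if (await getValue(cooldownKey)) {
    return false; // still cooling down; do nothing
  }
  await action();
  await setValue(cooldownKey, "true", AUTO_RECHARGE_COOLDOWN); // arm the cooldown after success
  return true;
}
```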


@ -6,24 +6,40 @@ import { Logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
import { getACUC, setCachedACUC } from "../../controllers/auth";
import { issueCredits } from "./issue_credits";
import { redlock } from "../redlock";
import { autoCharge } from "./auto_charge";
import { getValue, setValue } from "../redis";
const FREE_CREDITS = 500;
/**
* If you do not know the subscription_id in the current context, pass subscription_id as undefined.
*/
export async function billTeam(team_id: string, subscription_id: string | null | undefined, credits: number) {
export async function billTeam(
team_id: string,
subscription_id: string | null | undefined,
credits: number
) {
return withAuth(supaBillTeam)(team_id, subscription_id, credits);
}
export async function supaBillTeam(team_id: string, subscription_id: string, credits: number) {
export async function supaBillTeam(
team_id: string,
subscription_id: string,
credits: number
) {
if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" };
}
Logger.info(`Billing team ${team_id} for ${credits} credits`);
const { data, error } =
await supabase_service.rpc("bill_team", { _team_id: team_id, sub_id: subscription_id ?? null, fetch_subscription: subscription_id === undefined, credits });
const { data, error } = await supabase_service.rpc("bill_team", {
_team_id: team_id,
sub_id: subscription_id ?? null,
fetch_subscription: subscription_id === undefined,
credits,
});
if (error) {
Sentry.captureException(error);
Logger.error("Failed to bill team: " + JSON.stringify(error));
@ -31,53 +47,126 @@ export async function supaBillTeam(team_id: string, subscription_id: string, cre
}
(async () => {
for (const apiKey of (data ?? []).map(x => x.api_key)) {
await setCachedACUC(apiKey, acuc => (acuc ? {
...acuc,
credits_used: acuc.credits_used + credits,
adjusted_credits_used: acuc.adjusted_credits_used + credits,
remaining_credits: acuc.remaining_credits - credits,
} : null));
for (const apiKey of (data ?? []).map((x) => x.api_key)) {
await setCachedACUC(apiKey, (acuc) =>
acuc
? {
...acuc,
credits_used: acuc.credits_used + credits,
adjusted_credits_used: acuc.adjusted_credits_used + credits,
remaining_credits: acuc.remaining_credits - credits,
}
: null
);
}
})();
}
export async function checkTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) {
return withAuth(supaCheckTeamCredits)(chunk, team_id, credits);
export async function checkTeamCredits(
chunk: AuthCreditUsageChunk,
team_id: string,
credits: number
): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> {
const result = await withAuth(supaCheckTeamCredits)(chunk, team_id, credits);
return {
success: result.success,
message: result.message,
remainingCredits: result.remainingCredits,
chunk: chunk // Ensure chunk is always returned
};
}
// if team has enough credits for the operation, return true, else return false
export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) {
export async function supaCheckTeamCredits(
chunk: AuthCreditUsageChunk,
team_id: string,
credits: number
) {
// WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery
if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
return {
success: true,
message: "Preview team, no credits used",
remainingCredits: Infinity,
};
}
const creditsWillBeUsed = chunk.adjusted_credits_used + credits;
// In case chunk.price_credits is undefined, set it to a large number to avoid mistakes
const totalPriceCredits = chunk.total_credits_sum ?? 100000000;
// Removal of + credits
const creditUsagePercentage = creditsWillBeUsed / chunk.price_credits;
const creditUsagePercentage = chunk.adjusted_credits_used / totalPriceCredits;
let isAutoRechargeEnabled = false, autoRechargeThreshold = 1000;
const cacheKey = `team_auto_recharge_${team_id}`;
let cachedData = await getValue(cacheKey);
if (cachedData) {
const parsedData = JSON.parse(cachedData);
isAutoRechargeEnabled = parsedData.auto_recharge;
autoRechargeThreshold = parsedData.auto_recharge_threshold;
} else {
const { data, error } = await supabase_service
.from("teams")
.select("auto_recharge, auto_recharge_threshold")
.eq("id", team_id)
.single();
if (data) {
isAutoRechargeEnabled = data.auto_recharge;
autoRechargeThreshold = data.auto_recharge_threshold;
await setValue(cacheKey, JSON.stringify(data), 300); // Cache for 5 minutes (300 seconds)
}
}
if (isAutoRechargeEnabled && chunk.remaining_credits < autoRechargeThreshold) {
const autoChargeResult = await autoCharge(chunk, autoRechargeThreshold);
if (autoChargeResult.success) {
return {
success: true,
message: autoChargeResult.message,
remainingCredits: autoChargeResult.remainingCredits,
chunk: autoChargeResult.chunk,
};
}
}
// Compare the adjusted total credits used with the credits allowed by the plan
if (creditsWillBeUsed > chunk.price_credits) {
sendNotification(
team_id,
NotificationType.LIMIT_REACHED,
chunk.sub_current_period_start,
chunk.sub_current_period_end
);
return { success: false, message: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk };
if (creditsWillBeUsed > totalPriceCredits) {
// Only notify if the credits actually used (not the projected usage) already exceed the total plan credits
if (chunk.adjusted_credits_used > totalPriceCredits) {
sendNotification(
team_id,
NotificationType.LIMIT_REACHED,
chunk.sub_current_period_start,
chunk.sub_current_period_end,
chunk
);
}
return {
success: false,
message:
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.",
remainingCredits: chunk.remaining_credits,
chunk,
};
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
// Send email notification for approaching credit limit
sendNotification(
team_id,
NotificationType.APPROACHING_LIMIT,
chunk.sub_current_period_start,
chunk.sub_current_period_end
chunk.sub_current_period_end,
chunk
);
}
return { success: true, message: "Sufficient credits available", remainingCredits: chunk.remaining_credits, chunk };
return {
success: true,
message: "Sufficient credits available",
remainingCredits: chunk.remaining_credits,
chunk,
};
}
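A worked example of the threshold math above, under illustrative numbers: with 850 adjusted credits used out of 1000 total plan credits, usage sits at 0.85, so the approaching-limit notification fires while the request itself still succeeds.

```ts
// Illustrative numbers only — mirrors the checks in supaCheckTeamCredits.
const adjusted_credits_used = 850;
const total_credits_sum = 1000;
const credits = 30; // credits this request would consume

const creditsWillBeUsed = adjusted_credits_used + credits;                 // 880
const creditUsagePercentage = adjusted_credits_used / total_credits_sum;   // 0.85

const insufficient = creditsWillBeUsed > total_credits_sum;                          // false — request allowed
const approachingLimit = creditUsagePercentage >= 0.8 && creditUsagePercentage < 1;  // true — send notification
```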
// Count the total credits used by a team within the current billing period and return the remaining credits.


@ -0,0 +1,20 @@
import { Logger } from "../../lib/logger";
import { supabase_service } from "../supabase";
export async function issueCredits(team_id: string, credits: number) {
// Add an entry to supabase coupons
const { data, error } = await supabase_service.from("coupons").insert({
team_id: team_id,
credits: credits,
status: "active",
// indicates that this coupon was issued from auto recharge
from_auto_recharge: true,
});
if (error) {
Logger.error(`Error adding coupon: ${error}`);
return false;
}
return true;
}


@ -0,0 +1,56 @@
import { Logger } from "../../lib/logger";
import Stripe from "stripe";
const stripe = new Stripe(process.env.STRIPE_SECRET_KEY ?? "");
async function getCustomerDefaultPaymentMethod(customerId: string) {
const paymentMethods = await stripe.customers.listPaymentMethods(customerId, {
limit: 3,
});
return paymentMethods.data[0] ?? null;
}
type ReturnStatus = "succeeded" | "requires_action" | "failed";
export async function createPaymentIntent(
team_id: string,
customer_id: string
): Promise<{ return_status: ReturnStatus; charge_id: string }> {
try {
const defaultPaymentMethod = await getCustomerDefaultPaymentMethod(customer_id);
if (!defaultPaymentMethod) {
Logger.error(`No default payment method found for customer: ${customer_id}`);
return { return_status: "failed", charge_id: "" };
}
const paymentIntent = await stripe.paymentIntents.create({
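// Amount is in the smallest currency unit (cents), so 1100 = $11.00 USD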
amount: 1100,
currency: "usd",
customer: customer_id,
description: "Firecrawl: Auto re-charge of 1000 credits",
payment_method_types: [defaultPaymentMethod?.type ?? "card"],
payment_method: defaultPaymentMethod?.id,
off_session: true,
confirm: true,
});
if (paymentIntent.status === "succeeded") {
Logger.info(`Payment succeeded for team: ${team_id}`);
return { return_status: "succeeded", charge_id: paymentIntent.id };
} else if (
paymentIntent.status === "requires_action" ||
paymentIntent.status === "processing" ||
paymentIntent.status === "requires_capture"
) {
Logger.warn(`Payment requires further action for team: ${team_id}`);
return { return_status: "requires_action", charge_id: paymentIntent.id };
} else {
Logger.error(`Payment failed for team: ${team_id}`);
return { return_status: "failed", charge_id: paymentIntent.id };
}
} catch (error) {
Logger.error(
`Failed to create or confirm PaymentIntent for team: ${team_id}`
);
console.error(error);
return { return_status: "failed", charge_id: "" };
}
}

View File

@ -70,7 +70,9 @@ export async function logJob(job: FirecrawlJob) {
retry: job.retry,
},
};
posthog.capture(phLog);
if(job.mode !== "single_urls") {
posthog.capture(phLog);
}
}
if (error) {
Logger.error(`Error logging job: ${error.message}`);

View File

@ -3,6 +3,9 @@ import { withAuth } from "../../lib/withAuth";
import { Resend } from "resend";
import { NotificationType } from "../../types";
import { Logger } from "../../../src/lib/logger";
import { sendSlackWebhook } from "../alerts/slack";
import { getNotificationString } from "./notification_string";
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
const emailTemplates: Record<
NotificationType,
@ -21,25 +24,37 @@ const emailTemplates: Record<
subject: "Rate Limit Reached - Firecrawl",
html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
},
[NotificationType.AUTO_RECHARGE_SUCCESS]: {
subject: "Auto recharge successful - Firecrawl",
html: "Hey there,<br/><p>Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at <a href='https://firecrawl.dev/pricing'>firecrawl.dev/pricing</a> to avoid hitting the limit.</p><br/>Thanks,<br/>Firecrawl Team<br/>",
},
[NotificationType.AUTO_RECHARGE_FAILED]: {
subject: "Auto recharge failed - Firecrawl",
html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
},
};
export async function sendNotification(
team_id: string,
notificationType: NotificationType,
startDateString: string,
endDateString: string
endDateString: string,
chunk: AuthCreditUsageChunk,
bypassRecentChecks: boolean = false
) {
return withAuth(sendNotificationInternal)(
team_id,
notificationType,
startDateString,
endDateString
endDateString,
chunk,
bypassRecentChecks
);
}
async function sendEmailNotification(
export async function sendEmailNotification(
email: string,
notificationType: NotificationType
notificationType: NotificationType,
) {
const resend = new Resend(process.env.RESEND_API_KEY);
@ -66,80 +81,95 @@ export async function sendNotificationInternal(
team_id: string,
notificationType: NotificationType,
startDateString: string,
endDateString: string
endDateString: string,
chunk: AuthCreditUsageChunk,
bypassRecentChecks: boolean = false
): Promise<{ success: boolean }> {
if (team_id === "preview") {
return { success: true };
}
const fifteenDaysAgo = new Date();
fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
if (!bypassRecentChecks) {
const fifteenDaysAgo = new Date();
fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
const { data, error } = await supabase_service
.from("user_notifications")
.select("*")
.eq("team_id", team_id)
.eq("notification_type", notificationType)
.gte("sent_date", fifteenDaysAgo.toISOString());
if (error) {
Logger.debug(`Error fetching notifications: ${error}`);
return { success: false };
}
if (data.length !== 0) {
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`);
return { success: false };
}
const { data: recentData, error: recentError } = await supabase_service
.from("user_notifications")
.select("*")
.eq("team_id", team_id)
.eq("notification_type", notificationType)
.gte("sent_date", startDateString)
.lte("sent_date", endDateString);
if (recentError) {
Logger.debug(`Error fetching recent notifications: ${recentError}`);
return { success: false };
}
if (recentData.length !== 0) {
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`);
return { success: false };
} else {
console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
// get the emails from the user with the team_id
const { data: emails, error: emailsError } = await supabase_service
.from("users")
.select("email")
.eq("team_id", team_id);
if (emailsError) {
Logger.debug(`Error fetching emails: ${emailsError}`);
return { success: false };
}
for (const email of emails) {
await sendEmailNotification(email.email, notificationType);
}
const { error: insertError } = await supabase_service
const { data, error } = await supabase_service
.from("user_notifications")
.insert([
{
team_id: team_id,
notification_type: notificationType,
sent_date: new Date().toISOString(),
},
]);
.select("*")
.eq("team_id", team_id)
.eq("notification_type", notificationType)
.gte("sent_date", fifteenDaysAgo.toISOString());
if (insertError) {
Logger.debug(`Error inserting notification record: ${insertError}`);
if (error) {
Logger.debug(`Error fetching notifications: ${error}`);
return { success: false };
}
return { success: true };
if (data.length !== 0) {
return { success: false };
}
// TODO: observation: Free credits people are not receiving notifications
const { data: recentData, error: recentError } = await supabase_service
.from("user_notifications")
.select("*")
.eq("team_id", team_id)
.eq("notification_type", notificationType)
.gte("sent_date", startDateString)
.lte("sent_date", endDateString);
if (recentError) {
Logger.debug(`Error fetching recent notifications: ${recentError.message}`);
return { success: false };
}
if (recentData.length !== 0) {
return { success: false };
}
}
console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
// get the emails from the user with the team_id
const { data: emails, error: emailsError } = await supabase_service
.from("users")
.select("email")
.eq("team_id", team_id);
if (emailsError) {
Logger.debug(`Error fetching emails: ${emailsError}`);
return { success: false };
}
for (const email of emails) {
await sendEmailNotification(email.email, notificationType);
}
const { error: insertError } = await supabase_service
.from("user_notifications")
.insert([
{
team_id: team_id,
notification_type: notificationType,
sent_date: new Date().toISOString(),
},
]);
if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) {
sendSlackWebhook(
`${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`,
false,
process.env.SLACK_ADMIN_WEBHOOK_URL
).catch((error) => {
Logger.debug(`Error sending slack notification: ${error}`);
});
}
if (insertError) {
Logger.debug(`Error inserting notification record: ${insertError}`);
return { success: false };
}
return { success: true };
}

View File

@ -0,0 +1,21 @@
import { NotificationType } from "../../types";
// depending on the notification type, return the appropriate string
export function getNotificationString(
notificationType: NotificationType
): string {
switch (notificationType) {
case NotificationType.APPROACHING_LIMIT:
return "Approaching the limit (80%)";
case NotificationType.LIMIT_REACHED:
return "Limit reached (100%)";
case NotificationType.RATE_LIMIT_REACHED:
return "Rate limit reached";
case NotificationType.AUTO_RECHARGE_SUCCESS:
return "Auto-recharge successful";
case NotificationType.AUTO_RECHARGE_FAILED:
return "Auto-recharge failed";
default:
return "Unknown notification type";
}
}

View File

@ -329,7 +329,8 @@ async function processJob(job: Job, token: string) {
job.id as string,
data,
job.data.webhook,
job.data.v1
job.data.v1,
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
);
}
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
@ -339,7 +340,7 @@ async function processJob(job: Job, token: string) {
data,
job.data.webhook,
job.data.v1,
"crawl.page",
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
true
);
}
@ -365,7 +366,7 @@ async function processJob(job: Job, token: string) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (!job.data.sitemapped) {
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
if (!sc.cancelled) {
const crawler = crawlToCrawler(job.data.crawl_id, sc);
@ -415,8 +416,6 @@ async function processJob(job: Job, token: string) {
}
if (await finishCrawl(job.data.crawl_id)) {
if (!job.data.v1) {
const jobIDs = await getCrawlJobs(job.data.crawl_id);
@ -439,7 +438,7 @@ async function processJob(job: Job, token: string) {
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
url: sc.originUrl,
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
@ -469,7 +468,7 @@ async function processJob(job: Job, token: string) {
data,
job.data.webhook,
job.data.v1,
"crawl.completed"
job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
);
}
} else {
@ -487,7 +486,7 @@ async function processJob(job: Job, token: string) {
[],
job.data.webhook,
job.data.v1,
"crawl.completed"
job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
);
}
@ -499,8 +498,8 @@ async function processJob(job: Job, token: string) {
docs: [],
time_taken: (Date.now() - sc.createdAt) / 1000,
team_id: job.data.team_id,
mode: "crawl",
url: sc.originUrl,
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
crawlerOptions: sc.crawlerOptions,
pageOptions: sc.pageOptions,
origin: job.data.origin,
@ -556,7 +555,8 @@ async function processJob(job: Job, token: string) {
job.data.crawl_id ?? (job.id as string),
data,
job.data.webhook,
job.data.v1
job.data.v1,
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
);
}
// if (job.data.v1) {
@ -605,7 +605,7 @@ async function processJob(job: Job, token: string) {
docs: [],
time_taken: 0,
team_id: job.data.team_id,
mode: "crawl",
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
url: sc ? sc.originUrl : job.data.url,
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,

View File

@ -130,6 +130,8 @@ export enum NotificationType {
APPROACHING_LIMIT = "approachingLimit",
LIMIT_REACHED = "limitReached",
RATE_LIMIT_REACHED = "rateLimitReached",
AUTO_RECHARGE_SUCCESS = "autoRechargeSuccess",
AUTO_RECHARGE_FAILED = "autoRechargeFailed",
}
export type ScrapeLog = {
@ -159,4 +161,4 @@ export type PlanType =
| "";
export type WebhookEventType = "crawl.page" | "crawl.started" | "crawl.completed" | "crawl.failed";
export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";

View File

@ -6,7 +6,7 @@
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
"contact": {
"name": "Firecrawl Support",
"url": "https://firecrawl.dev",
"url": "https://firecrawl.dev/support",
"email": "support@firecrawl.dev"
}
},
@ -97,6 +97,127 @@
"description": "The prompt to use for the extraction without a schema (Optional)"
}
}
},
"actions": {
"type": "array",
"description": "Actions to perform on the page before grabbing the content",
"items": {
"oneOf": [
{
"type": "object",
"title": "Wait",
"properties": {
"type": {
"type": "string",
"enum": ["wait"],
"description": "Wait for a specified amount of milliseconds"
},
"milliseconds": {
"type": "integer",
"minimum": 1,
"description": "Number of milliseconds to wait"
}
},
"required": ["type", "milliseconds"]
},
{
"type": "object",
"title": "Screenshot",
"properties": {
"type": {
"type": "string",
"enum": ["screenshot"],
"description": "Take a screenshot"
},
"fullPage": {
"type": "boolean",
"description": "Should the screenshot be full-page or viewport sized?",
"default": false
}
},
"required": ["type"]
},
{
"type": "object",
"title": "Click",
"properties": {
"type": {
"type": "string",
"enum": ["click"],
"description": "Click on an element"
},
"selector": {
"type": "string",
"description": "Query selector to find the element by",
"example": "#load-more-button"
}
},
"required": ["type", "selector"]
},
{
"type": "object",
"title": "Write text",
"properties": {
"type": {
"type": "string",
"enum": ["write"],
"description": "Write text into an input field"
},
"text": {
"type": "string",
"description": "Text to type",
"example": "Hello, world!"
},
"selector": {
"type": "string",
"description": "Query selector for the input field",
"example": "#search-input"
}
},
"required": ["type", "text", "selector"]
},
{
"type": "object",
"title": "Press a key",
"description": "Press a key on the page. See https://asawicki.info/nosense/doc/devices/keyboard/key_codes.html for key codes.",
"properties": {
"type": {
"type": "string",
"enum": ["press"],
"description": "Press a key on the page"
},
"key": {
"type": "string",
"description": "Key to press",
"example": "Enter"
}
},
"required": ["type", "key"]
},
{
"type": "object",
"title": "Scroll",
"properties": {
"type": {
"type": "string",
"enum": ["scroll"],
"description": "Scroll the page"
},
"direction": {
"type": "string",
"enum": ["up", "down"],
"description": "Direction to scroll"
},
"amount": {
"type": "integer",
"description": "Amount to scroll in pixels",
"minimum": 1
}
},
"required": ["type", "direction"]
}
]
}
}
},
"required": ["url"]
@ -341,14 +462,14 @@
"items": {
"type": "string"
},
"description": "URL patterns to exclude"
"description": "Specifies URL patterns to exclude from the crawl by comparing website paths against the provided regex patterns. For example, if you set \"excludePaths\": [\"blog/*\"] for the base URL firecrawl.dev, any results matching that pattern will be excluded, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap."
},
"includePaths": {
"type": "array",
"items": {
"type": "string"
},
"description": "URL patterns to include"
"description": "Specifies URL patterns to include in the crawl by comparing website paths against the provided regex patterns. Only the paths that match the specified patterns will be included in the response. For example, if you set \"includePaths\": [\"blog/*\"] for the base URL firecrawl.dev, only results matching that pattern will be included, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap."
},
"maxDepth": {
"type": "integer",
@ -362,7 +483,7 @@
},
"limit": {
"type": "integer",
"description": "Maximum number of pages to crawl",
"description": "Maximum number of pages to crawl. Default limit is 10000.",
"default": 10
},
"allowBackwardLinks": {
@ -513,7 +634,7 @@
},
"search": {
"type": "string",
"description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 100 search results. However, if map finds more results, there is no limit applied."
"description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied."
},
"ignoreSitemap": {
"type": "boolean",
@ -642,6 +763,21 @@
},
"description": "List of links on the page if `links` is in `formats`"
},
"actions": {
"type": "object",
"nullable": true,
"description": "Results of the actions specified in the `actions` parameter. Only present if the `actions` parameter was provided in the request",
"properties": {
"screenshots": {
"type": "array",
"description": "Screenshot URLs, in the same order as the screenshot actions provided.",
"items": {
"type": "string",
"format": "url"
}
}
}
},
"metadata": {
"type": "object",
"properties": {

View File

@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
});
```
### Batch scraping multiple URLs
To batch scrape multiple URLs, use the `batchScrapeUrls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
```js
const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
formats: ['markdown', 'html'],
})
```
#### Asynchronous batch scrape
To start an asynchronous batch scrape, use the `asyncBatchScrapeUrls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats. On success, it returns an ID that you can pass to `checkBatchScrapeStatus`, as shown below.
```js
const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
```
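You can then check the job with the `checkBatchScrapeStatus` method. A minimal sketch, assuming `asyncBatchScrapeResult` from the snippet above contains the returned `id`:
```js
const batchScrapeStatus = await app.checkBatchScrapeStatus(asyncBatchScrapeResult.id);
if (batchScrapeStatus.status === "completed") {
  console.log(batchScrapeStatus.data); // the scraped documents
} else {
  console.log(`Batch scrape is ${batchScrapeStatus.status}: ${batchScrapeStatus.completed}/${batchScrapeStatus.total} pages done`);
}
```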
#### Batch scrape with WebSockets
To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
```js
// Batch scrape multiple URLs with WebSockets:
const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
watch.addEventListener("document", doc => {
console.log("DOC", doc.detail);
});
watch.addEventListener("error", err => {
console.error("ERR", err.detail.error);
});
watch.addEventListener("done", state => {
console.log("DONE", state.detail.status);
});
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and throws appropriate exceptions. If an error occurs during a request, an exception is thrown with a descriptive error message, which you can handle using `try/catch` blocks, as shown below.
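A minimal sketch of this pattern, assuming a single scrape of https://firecrawl.dev with the markdown format:
```js
try {
  const scrapeResult = await app.scrapeUrl('https://firecrawl.dev', { formats: ['markdown'] });
  console.log(scrapeResult);
} catch (error) {
  // Errors thrown by the SDK carry a descriptive message (and an HTTP status code where available)
  console.error(`Request failed: ${error.message}`);
}
```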

View File

@ -1,6 +1,6 @@
{
"name": "firecrawl",
"version": "1.6.1",
"name": "@mendable/firecrawl-js",
"version": "1.7.2",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -82,6 +82,10 @@ export interface CrawlScrapeOptions {
onlyMainContent?: boolean;
waitFor?: number;
timeout?: number;
location?: {
country?: string;
languages?: string[];
};
}
export type Action = {
@ -154,6 +158,17 @@ export interface CrawlResponse {
error?: string;
}
/**
* Response interface for batch scrape operations.
* Defines the structure of the response received after initiating a batch scrape.
*/
export interface BatchScrapeResponse {
id?: string;
url?: string;
success: true;
error?: string;
}
/**
* Response interface for job status checks.
* Provides detailed status of a crawl job including progress and results.
@ -169,6 +184,21 @@ export interface CrawlStatusResponse {
data: FirecrawlDocument<undefined>[];
};
/**
* Response interface for batch scrape job status checks.
* Provides detailed status of a batch scrape job including progress and results.
*/
export interface BatchScrapeStatusResponse {
success: true;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
creditsUsed: number;
expiresAt: Date;
next?: string;
data: FirecrawlDocument<undefined>[];
};
/**
* Parameters for mapping operations.
* Defines options for mapping URLs during a crawl.
@ -493,6 +523,144 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Initiates a batch scrape job for multiple URLs using the Firecrawl API.
* @param urls - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the batch scrape operation.
*/
async batchScrapeUrls(
urls: string[],
params?: ScrapeParams,
pollInterval: number = 2,
idempotencyKey?: string
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
if (response.status === 200) {
const id: string = response.data.id;
return this.monitorJobStatus(id, headers, pollInterval);
} else {
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
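/**
* Initiates a batch scrape job for multiple URLs without waiting for completion.
* @param urls - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the batch scrape initiation, including the job ID.
*/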
async asyncBatchScrapeUrls(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string
): Promise<BatchScrapeResponse | ErrorResponse> {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData: any = { urls, ...(params ?? {}) };
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/batch/scrape`,
jsonData,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "start batch scrape job");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error." };
}
/**
* Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
* @param urls - The URLs to scrape.
* @param params - Additional parameters for the scrape request.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns A CrawlWatcher instance to monitor the batch scrape job.
*/
async batchScrapeUrlsAndWatch(
urls: string[],
params?: ScrapeParams,
idempotencyKey?: string,
) {
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
if (crawl.success && crawl.id) {
const id = crawl.id;
return new CrawlWatcher(id, this);
}
throw new FirecrawlError("Batch scrape job failed to start", 400);
}
/**
* Checks the status of a batch scrape job using the Firecrawl API.
* @param id - The ID of the batch scrape operation.
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
* @returns The response containing the job status.
*/
async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<BatchScrapeStatusResponse | ErrorResponse> {
if (!id) {
throw new FirecrawlError("No batch scrape ID provided", 400);
}
const headers: AxiosRequestHeaders = this.prepareHeaders();
try {
const response: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/batch/scrape/${id}`,
headers
);
if (response.status === 200) {
let allData = response.data.data;
if (getAllData && response.data.status === "completed") {
let statusData = response.data
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: allData,
error: response.data.error,
})
} else {
this.handleError(response, "check batch scrape status");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.

View File

@ -9,7 +9,7 @@
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^1.0.3",
"@mendable/firecrawl-js": "^1.7.0-beta.2",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",
@ -423,31 +423,17 @@
}
},
"node_modules/@mendable/firecrawl-js": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
"integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
"version": "1.7.0-beta.2",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz",
"integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
}
},
"node_modules/@mendable/firecrawl-js/node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/@tsconfig/node10": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",

View File

@ -11,7 +11,7 @@
"author": "",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^1.0.3",
"@mendable/firecrawl-js": "1.7.1",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",

View File

@ -36,7 +36,6 @@ crawl_status = app.crawl_url(
'limit': 100,
'scrapeOptions': {'formats': ['markdown', 'html']}
},
wait_until_done=True,
poll_interval=30
)
print(crawl_status)
@ -150,6 +149,69 @@ async def start_crawl_and_watch():
await start_crawl_and_watch()
```
### Scraping multiple URLs in batch
To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
idempotency_key = str(uuid.uuid4()) # optional idempotency key
batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
print(batch_scrape_result)
```
### Asynchronous batch scrape
To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
print(batch_scrape_result)
```
### Checking batch scrape status
To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
```python
id = batch_scrape_result['id']
status = app.check_batch_scrape_status(id)
```
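If the job was started with `async_batch_scrape_urls`, you can poll this method until it reaches a terminal status. A minimal sketch (the 2-second interval is an arbitrary choice; the status values mirror those documented for the batch scrape job):
```python
import time
id = batch_scrape_result['id']
while True:
    status = app.check_batch_scrape_status(id)
    if status['status'] in ('completed', 'failed', 'cancelled'):
        break
    print(f"Scraped {status['completed']}/{status['total']} pages so far...")
    time.sleep(2)
if status['status'] == 'completed':
    print(status['data'])
```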
### Batch scrape with WebSockets
To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
```python
# inside an async function...
nest_asyncio.apply()
# Define event handlers
def on_document(detail):
print("DOC", detail)
def on_error(detail):
print("ERR", detail['error'])
def on_done(detail):
print("DONE", detail['status'])
# Function to start the crawl and watch process
async def start_crawl_and_watch():
# Initiate the crawl job and get the watcher
watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
# Add event listeners
watcher.add_event_listener("document", on_document)
watcher.add_event_listener("error", on_error)
watcher.add_event_listener("done", on_done)
# Start the watcher
await watcher.connect()
# Run the event loop
await start_crawl_and_watch()
```
## Error Handling
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
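For example, a minimal sketch of catching a failed scrape (the SDK raises plain `Exception`s with descriptive messages, so a broad `except` is used here):
```python
try:
    scrape_result = app.scrape_url('https://firecrawl.dev', params={'formats': ['markdown']})
    print(scrape_result['markdown'])
except Exception as e:
    print(f"Scrape failed: {e}")
```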

View File

@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-")
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])
# Test batch scrape
urls = ['https://example.com', 'https://docs.firecrawl.dev']
batch_scrape_params = {
'formats': ['markdown', 'html'],
}
# Synchronous batch scrape
batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
print("Synchronous Batch Scrape Result:")
print(batch_result['data'][0]['markdown'])
# Asynchronous batch scrape
async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
print("\nAsynchronous Batch Scrape Result:")
print(async_batch_result)
# Crawl a website:
idempotency_key = str(uuid.uuid4()) # optional idempotency key
crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp
__version__ = "1.3.0"
__version__ = "1.4.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -81,8 +81,10 @@ class FirecrawlApp:
response = response.json()
if response['success'] and 'data' in response:
return response['data']
else:
elif "error" in response:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
else:
raise Exception(f'Failed to scrape URL. Error: {response}')
else:
self._handle_error(response, 'scrape URL')
@ -117,7 +119,14 @@ class FirecrawlApp:
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Any: The crawl job ID or the crawl results if waiting until completion.
Dict[str, Any]: A dictionary containing the crawl results. The structure includes:
- 'success' (bool): Indicates if the crawl was successful.
- 'status' (str): The final status of the crawl job (e.g., 'completed').
- 'completed' (int): Number of scraped pages that completed.
- 'total' (int): Total number of scraped pages.
- 'creditsUsed' (int): Estimated number of API credits used for this crawl.
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires.
- 'data' (List[Dict]): List of all the scraped pages.
Raises:
Exception: If the crawl job initiation or monitoring fails.
@ -146,7 +155,10 @@ class FirecrawlApp:
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Dict[str, Any]: The response from the crawl initiation request.
Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes:
- 'success' (bool): Indicates if the crawl initiation was successful.
- 'id' (str): The unique identifier for the crawl job.
- 'url' (str): The URL to check the status of the crawl job.
"""
endpoint = f'/v1/crawl'
headers = self._prepare_headers(idempotency_key)
@ -236,7 +248,7 @@ class FirecrawlApp:
params (Optional[Dict[str, Any]]): Additional parameters for the map search.
Returns:
Any: The result of the map search, typically a dictionary containing mapping data.
List[str]: A list of URLs discovered during the map search.
"""
endpoint = f'/v1/map'
headers = self._prepare_headers()
@ -256,11 +268,130 @@ class FirecrawlApp:
response = response.json()
if response['success'] and 'links' in response:
return response
else:
elif 'error' in response:
raise Exception(f'Failed to map URL. Error: {response["error"]}')
else:
raise Exception(f'Failed to map URL. Error: {response}')
else:
self._handle_error(response, 'map')
def batch_scrape_urls(self, urls: list[str],
params: Optional[Dict[str, Any]] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None) -> Any:
"""
Initiate a batch scrape job for the specified URLs using the Firecrawl API.
Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
- 'success' (bool): Indicates if the batch scrape was successful.
- 'status' (str): The final status of the batch scrape job (e.g., 'completed').
- 'completed' (int): Number of scraped pages that completed.
- 'total' (int): Total number of scraped pages.
- 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
- 'data' (List[Dict]): List of all the scraped pages.
Raises:
Exception: If the batch scrape job initiation or monitoring fails.
"""
endpoint = f'/v1/batch/scrape'
headers = self._prepare_headers(idempotency_key)
json_data = {'urls': urls}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
id = response.json().get('id')
return self._monitor_job_status(id, headers, poll_interval)
else:
self._handle_error(response, 'start batch scrape job')
def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
"""
Initiate a batch scrape job asynchronously.
Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
- 'success' (bool): Indicates if the batch scrape initiation was successful.
- 'id' (str): The unique identifier for the batch scrape job.
- 'url' (str): The URL to check the status of the batch scrape job.
"""
endpoint = f'/v1/batch/scrape'
headers = self._prepare_headers(idempotency_key)
json_data = {'urls': urls}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'start batch scrape job')
def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
"""
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
Returns:
CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
"""
crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
if crawl_response['success'] and 'id' in crawl_response:
return CrawlWatcher(crawl_response['id'], self)
else:
raise Exception("Batch scrape job failed to start")
def check_batch_scrape_status(self, id: str) -> Any:
"""
Check the status of a batch scrape job using the Firecrawl API.
Args:
id (str): The ID of the batch scrape job.
Returns:
Any: The status of the batch scrape job.
Raises:
Exception: If the status check request fails.
"""
endpoint = f'/v1/batch/scrape/{id}'
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
data = response.json()
return {
'success': True,
'status': data.get('status'),
'total': data.get('total'),
'completed': data.get('completed'),
'creditsUsed': data.get('creditsUsed'),
'expiresAt': data.get('expiresAt'),
'next': data.get('next'),
'data': data.get('data'),
'error': data.get('error')
}
else:
self._handle_error(response, 'check batch scrape status')
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.

View File

@ -0,0 +1,166 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
import agentops
# ANSI color codes
class Colors:
CYAN = '\033[96m'
YELLOW = '\033[93m'
GREEN = '\033[92m'
RED = '\033[91m'
MAGENTA = '\033[95m'
BLUE = '\033[94m'
RESET = '\033[0m'
# Load environment variables
load_dotenv()
# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
# Initialize the FirecrawlApp and Anthropic client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = anthropic.Anthropic(api_key=anthropic_api_key)
# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app, client):
try:
print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")
map_prompt = f"""
The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
"""
print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
completion = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1000,
temperature=0,
system="You are an expert web crawler. Respond with the best search parameter.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": map_prompt
}
]
}
]
)
map_search_parameter = completion.content[0].text
print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")
print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
map_website = app.map_url(url, params={"search": map_search_parameter})
print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
return map_website['links']
except Exception as e:
print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
return None
# Batch scrape the top pages and check whether the objective is met; if so, return the result in JSON format, else return None
def find_objective_in_top_pages(map_website, objective, app, client):
try:
# Get top 2 links from the map result
top_links = map_website[:2]
print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")
# Scrape the pages in batch
batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})
print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}")
for scrape_result in batch_scrape_result['data']:
# Check if objective is met
check_prompt = f"""
Given the following scraped content and objective, determine if the objective is met.
If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
If the objective is not met with confidence, respond with 'Objective not met'.
Objective: {objective}
Scraped content: {scrape_result['markdown']}
Remember:
1. Only return JSON if you are confident the objective is fully met.
2. Keep the JSON structure as simple and flat as possible.
3. Do not include any explanations or markdown formatting in your response.
"""
completion = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1000,
temperature=0,
system="You are an expert web crawler. Respond with the relevant information in JSON format.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": check_prompt
}
]
}
]
)
result = completion.content[0].text
if result != "Objective not met":
print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
try:
return json.loads(result)
except json.JSONDecodeError:
print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
else:
print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")
print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
return None
except Exception as e:
print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
return None
# Main function to execute the process
def main():
# Get user input
url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
if not url.strip():
url = "https://www.firecrawl.dev/"
objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
if not objective.strip():
objective = "find me the pricing plans"
print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
# Find the relevant page
map_website = find_relevant_page_via_map(objective, url, app, client)
print(map_website)
if map_website:
print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
# Find objective in top pages
result = find_objective_in_top_pages(map_website, objective, app, client)
if result:
print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
else:
print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
else:
print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
if __name__ == "__main__":
agentops.init(os.getenv("AGENTOPS_API_KEY"))
main()

View File

@ -0,0 +1,150 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import requests
# ANSI color codes
class Colors:
CYAN = '\033[96m'
YELLOW = '\033[93m'
GREEN = '\033[92m'
RED = '\033[91m'
MAGENTA = '\033[95m'
BLUE = '\033[94m'
RESET = '\033[0m'
# Load environment variables
load_dotenv()
# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
grok_api_key = os.getenv("GROK_API_KEY")
# Initialize the FirecrawlApp
app = FirecrawlApp(api_key=firecrawl_api_key)
# Function to make Grok API calls
def grok_completion(prompt):
url = "https://api.x.ai/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {grok_api_key}"
}
data = {
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": prompt
}
],
"model": "grok-beta",
"stream": False,
"temperature": 0
}
response = requests.post(url, headers=headers, json=data)
return response.json()['choices'][0]['message']['content']
# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app):
try:
print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")
map_prompt = f"""
The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
"""
print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
map_search_parameter = grok_completion(map_prompt)
print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")
print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
print(f"{Colors.MAGENTA}{map_search_parameter}{Colors.RESET}")
map_website = app.map_url(url, params={"search": map_search_parameter})
print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
return map_website["links"]
except Exception as e:
print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
return None
# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None
def find_objective_in_top_pages(map_website, objective, app):
try:
print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
# Get top 3 links from the map result
top_links = map_website[:3] if isinstance(map_website, list) else []
print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")
for link in top_links:
print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
# Scrape the page
scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")
# Check if objective is met
check_prompt = f"""
Given the following scraped content and objective, determine if the objective is met.
If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
If the objective is not met with confidence, respond with 'Objective not met'.
Objective: {objective}
Scraped content: {scrape_result['markdown']}
Remember:
1. Only return JSON if you are confident the objective is fully met.
2. Keep the JSON structure as simple and flat as possible.
3. Do not include any explanations or markdown formatting in your response.
"""
result = grok_completion(check_prompt)
print(f"{Colors.MAGENTA}{result}{Colors.RESET}")
if result != "Objective not met":
print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
try:
result = result.replace("```json", "").replace("```", "")
return json.loads(result)
except json.JSONDecodeError:
print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
else:
print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")
print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
return None
except Exception as e:
print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
return None
# Main function to execute the process
def main():
# Get user input
url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
# Find the relevant page
map_website = find_relevant_page_via_map(objective, url, app)
if map_website:
print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
# Find objective in top pages
result = find_objective_in_top_pages(map_website, objective, app)
if result:
print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
else:
print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
else:
print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,2 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=

View File

@ -0,0 +1,37 @@
# Swarm Firecrawl Marketing Agent
A multi-agent system built on [OpenAI Swarm](https://github.com/openai/swarm) that generates AI-powered marketing strategies, using [Firecrawl](https://firecrawl.dev) for web scraping.
## Agents
1. User Interface: Manages user interactions
2. Website Scraper: Extracts clean LLM-ready content via Firecrawl API
3. Analyst: Provides marketing insights
4. Campaign Idea: Generates marketing campaign concepts
5. Copywriter: Creates compelling marketing copy
## Requirements
- [Firecrawl](https://firecrawl.dev) API key
- [OpenAI](https://platform.openai.com/api-keys) API key
## Setup
1. Install the required packages:
```
pip install -r requirements.txt
```
2. Set up your environment variables in a `.env` file:
```
OPENAI_API_KEY=your_openai_api_key
FIRECRAWL_API_KEY=your_firecrawl_api_key
```
## Usage
Run the main script to start the interactive demo:
```
python main.py
```

View File

@ -0,0 +1,108 @@
import os
from firecrawl import FirecrawlApp
from swarm import Agent
from swarm.repl import run_demo_loop
import dotenv
from openai import OpenAI
dotenv.load_dotenv()
# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def scrape_website(url):
"""Scrape a website using Firecrawl."""
scrape_status = app.scrape_url(
url,
params={'formats': ['markdown']}
)
return scrape_status
def generate_completion(role, task, content):
"""Generate a completion using OpenAI."""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": f"You are a {role}. {task}"},
{"role": "user", "content": content}
]
)
return response.choices[0].message.content
def analyze_website_content(content):
"""Analyze the scraped website content using OpenAI."""
analysis = generate_completion(
"marketing analyst",
"Analyze the following website content and provide key insights for marketing strategy.",
content
)
return {"analysis": analysis}
def generate_copy(brief):
"""Generate marketing copy based on a brief using OpenAI."""
copy = generate_completion(
"copywriter",
"Create compelling marketing copy based on the following brief.",
brief
)
return {"copy": copy}
def create_campaign_idea(target_audience, goals):
"""Create a campaign idea based on target audience and goals using OpenAI."""
campaign_idea = generate_completion(
"marketing strategist",
"Create an innovative campaign idea based on the target audience and goals provided.",
f"Target Audience: {target_audience}\nGoals: {goals}"
)
return {"campaign_idea": campaign_idea}
def handoff_to_copywriter():
"""Hand off the campaign idea to the copywriter agent."""
return copywriter_agent
def handoff_to_analyst():
"""Hand off the website content to the analyst agent."""
return analyst_agent
def handoff_to_campaign_idea():
"""Hand off the target audience and goals to the campaign idea agent."""
return campaign_idea_agent
def handoff_to_website_scraper():
"""Hand off the url to the website scraper agent."""
return website_scraper_agent
user_interface_agent = Agent(
name="User Interface Agent",
instructions="You are a user interface agent that handles all interactions with the user. You need to always start with a URL that the user wants to create a marketing strategy for. Ask clarification questions if needed. Be concise.",
functions=[handoff_to_website_scraper],
)
website_scraper_agent = Agent(
name="Website Scraper Agent",
instructions="You are a website scraper agent specialized in scraping website content.",
functions=[scrape_website, handoff_to_analyst],
)
analyst_agent = Agent(
name="Analyst Agent",
instructions="You are an analyst agent that examines website content and provides insights for marketing strategies. Be concise.",
functions=[analyze_website_content, handoff_to_campaign_idea],
)
campaign_idea_agent = Agent(
name="Campaign Idea Agent",
instructions="You are a campaign idea agent that creates innovative marketing campaign ideas based on website content and target audience. Be concise.",
functions=[create_campaign_idea, handoff_to_copywriter],
)
copywriter_agent = Agent(
name="Copywriter Agent",
instructions="You are a copywriter agent specialized in creating compelling marketing copy based on website content and campaign ideas. Be concise.",
functions=[generate_copy],
)
if __name__ == "__main__":
# Run the demo loop with the user interface agent
run_demo_loop(user_interface_agent, stream=True)

View File

@ -0,0 +1,2 @@
firecrawl-py
openai

View File

@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=

View File

@ -0,0 +1,120 @@
import os
from firecrawl import FirecrawlApp
from swarm import Agent
from swarm.repl import run_demo_loop
import dotenv
from serpapi import GoogleSearch
from openai import OpenAI
dotenv.load_dotenv()
# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def search_google(query, objective):
"""Search Google using SerpAPI."""
print(f"Parameters: query={query}, objective={objective}")
search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
results = search.get_dict().get("organic_results", [])
return {"objective": objective, "results": results}
def map_url_pages(url, objective):
"""Map a website's pages using Firecrawl."""
search_query = generate_completion(
"website search query generator",
f"Generate a 1-2 word search query for the website: {url} based on the objective",
"Objective: " + objective
)
print(f"Parameters: url={url}, objective={objective}, search_query={search_query}")
map_status = app.map_url(url, params={'search': search_query})
if map_status.get('status') == 'success':
links = map_status.get('links', [])
top_link = links[0] if links else None
return {"objective": objective, "results": [top_link] if top_link else []}
else:
return {"objective": objective, "results": []}
def scrape_url(url, objective):
"""Scrape a website using Firecrawl."""
print(f"Parameters: url={url}, objective={objective}")
scrape_status = app.scrape_url(
url,
params={'formats': ['markdown']}
)
return {"objective": objective, "results": scrape_status}
def analyze_website_content(content, objective):
"""Analyze the scraped website content using OpenAI."""
print(f"Parameters: content={content[:50]}..., objective={objective}")
analysis = generate_completion(
"website data extractor",
f"Analyze the following website content and extract a JSON object based on the objective.",
"Objective: " + objective + "\nContent: " + content
)
return {"objective": objective, "results": analysis}
def generate_completion(role, task, content):
"""Generate a completion using OpenAI."""
print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": f"You are a {role}. {task}"},
{"role": "user", "content": content}
]
)
return response.choices[0].message.content
def handoff_to_search_google():
"""Hand off the search query to the search google agent."""
return google_search_agent

def handoff_to_map_url():
    """Hand off the url to the map url agent."""
    return map_url_agent

def handoff_to_website_scraper():
    """Hand off the url to the website scraper agent."""
    return website_scraper_agent

def handoff_to_analyst():
    """Hand off the website content to the analyst agent."""
    return analyst_agent
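
# Agent definitions: each agent gets its instructions plus the tools it may call.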
user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with a web data extraction objective that the user wants to achieve by searching the web, mapping the web pages, and extracting the content from a specific page. Be concise.",
    functions=[handoff_to_search_google],
)

google_search_agent = Agent(
    name="Google Search Agent",
    instructions="You are a Google search agent specialized in searching the web. Only search for the website, not any specific page. When you are done, you must hand off to the map agent.",
    functions=[search_google, handoff_to_map_url],
)

map_url_agent = Agent(
    name="Map URL Agent",
    instructions="You are a map URL agent specialized in mapping a website's pages. When you are done, you must hand off the results to the website scraper agent.",
    functions=[map_url_pages, handoff_to_website_scraper],
)

website_scraper_agent = Agent(
    name="Website Scraper Agent",
    instructions="You are a website scraper agent specialized in scraping website content. When you are done, you must hand off the website content to the analyst agent to extract the data based on the objective.",
    functions=[scrape_url, handoff_to_analyst],
)

analyst_agent = Agent(
    name="Analyst Agent",
    instructions="You are an analyst agent that extracts the requested data from website content. When you are done, you must return a JSON object.",
    functions=[analyze_website_content],
)

if __name__ == "__main__":
# Run the demo loop with the user interface agent
run_demo_loop(user_interface_agent, stream=True)

View File

@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git

View File

@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=

View File

@ -0,0 +1,78 @@
import csv
import json
import os
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
from serpapi import GoogleSearch
from swarm import Agent
from swarm.repl import run_demo_loop

load_dotenv()

# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
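
# Crawl up to 10 pages with Firecrawl, then extract a JSON object from each page's markdown.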
def crawl_and_analyze_url(url, objective):
"""Crawl a website using Firecrawl and analyze the content."""
print(f"Parameters: url={url}, objective={objective}")
# Crawl the website
crawl_status = app.crawl_url(
url,
params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
poll_interval=5
)
crawl_status = crawl_status['data']
# Process each 'markdown' element individually
combined_results = []
for item in crawl_status:
if 'markdown' in item:
content = item['markdown']
# Analyze the content
analysis = generate_completion(
"website data extractor",
f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
"Objective: " + objective + "\nContent: " + content
)
# Parse the JSON result
try:
result = json.loads(analysis)
combined_results.append(result)
except json.JSONDecodeError:
print(f"Could not parse JSON from analysis: {analysis}")
# Combine the results
return {"objective": objective, "results": combined_results}

def generate_completion(role, task, content):
"""Generate a completion using OpenAI."""
print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": f"You are a {role}. {task}"},
{"role": "user", "content": content}
]
)
return response.choices[0].message.content
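
# Returning the crawl agent from this tool call is how Swarm hands the conversation over.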
def handoff_to_crawl_url():
"""Hand off the url to the crawl url agent."""
return crawl_website_agent
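
# Agent definitions: the interface agent collects the URL and objective, then hands off to the crawler.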
user_interface_agent = Agent(
name="User Interface Agent",
instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.",
functions=[handoff_to_crawl_url],
)

crawl_website_agent = Agent(
name="Crawl Website Agent",
instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.",
functions=[crawl_and_analyze_url],
)

if __name__ == "__main__":
# Run the demo loop with the user interface agent
run_demo_loop(user_interface_agent, stream=True)

View File

@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git

View File

@ -98,7 +98,7 @@
"source": [
"# Create a cache with a 5 minute TTL\n",
"cache = caching.CachedContent.create(\n",
" model=\"models/gemini-1.5-pro-001\",\n",
" model=\"models/gemini-1.5-pro-002\",\n",
" display_name=\"website crawl testing again\", # used to identify the cache\n",
" system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
" contents=[text_file],\n",

View File

@ -0,0 +1,166 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import datetime\n",
"import time\n",
"import google.generativeai as genai\n",
"from google.generativeai import caching\n",
"from dotenv import load_dotenv\n",
"from firecrawl import FirecrawlApp\n",
"import json\n",
"\n",
"# Load environment variables\n",
"load_dotenv()\n",
"\n",
"# Retrieve API keys from environment variables\n",
"google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
"firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
"\n",
"# Configure the Google Generative AI module with the API key\n",
"genai.configure(api_key=google_api_key)\n",
"\n",
"# Initialize the FirecrawlApp with your API key\n",
"app = FirecrawlApp(api_key=firecrawl_api_key)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No data returned from crawl.\n"
]
}
],
"source": [
"# Crawl a website\n",
"crawl_url = 'https://dify.ai/'\n",
"params = {\n",
" \n",
" 'crawlOptions': {\n",
" 'limit': 100\n",
" }\n",
"}\n",
"crawl_result = app.crawl_url(crawl_url, params=params)\n",
"\n",
"if crawl_result is not None:\n",
" # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
" cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
"\n",
" # Save the modified results as a text file containing JSON data\n",
" with open('crawl_result.txt', 'w') as file:\n",
" file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
"else:\n",
" print(\"No data returned from crawl.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Upload the video using the Files API\n",
"text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
"\n",
"# Wait for the file to finish processing\n",
"while text_file.state.name == \"PROCESSING\":\n",
" print('Waiting for file to be processed.')\n",
" time.sleep(2)\n",
" text_file = genai.get_file(text_file.name)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Create a cache with a 5 minute TTL\n",
"cache = caching.CachedContent.create(\n",
" model=\"models/gemini-1.5-flash-002\",\n",
" display_name=\"website crawl testing again\", # used to identify the cache\n",
" system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
" contents=[text_file],\n",
" ttl=datetime.timedelta(minutes=15),\n",
")\n",
"# Construct a GenerativeModel which uses the created cache.\n",
"model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
"\n",
"Here's how Firecrawl helps:\n",
"\n",
"* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
"* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
"* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
"\n",
"You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
"\n",
"Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
"\n"
]
}
],
"source": [
"# Query the model\n",
"response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
"response_dict = response.to_dict()\n",
"response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
"print(response_text)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}