Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-15 19:22:19 +08:00)

Merge remote-tracking branch 'origin/main' into pr/765
This commit is contained in: commit d301c1bf0f

.github/workflows/check-queues.yml (vendored, 20 changes)
@@ -1,20 +0,0 @@
name: Check Queues
on:
  schedule:
    - cron: '*/5 * * * *'

env:
  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}

jobs:
  clean-jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Send GET request to check queues
        run: |
          response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/check-queues)
          if [ "$response" -ne 200 ]; then
            echo "Failed to check queues. Response: $response"
            exit 1
          fi
          echo "Successfully checked queues. Response: $response"
.gitignore (vendored, 2 changes)
@@ -28,3 +28,5 @@ apps/js-sdk/firecrawl/dist
/examples/o1_web_crawler/firecrawl_env
/examples/crm_lead_enrichment/crm_lead_enrichment_env
/.venv
/examples/claude_web_crawler/firecrawl_env
README.md (30 changes)
@@ -1,4 +1,5 @@
<h3 align="center">
  <a name="readme-top"></a>
  <img
    src="https://raw.githubusercontent.com/mendableai/firecrawl/main/img/firecrawl_logo.png"
    height="200"
@@ -79,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Actions**: click, scroll, input, wait and more before extracting data
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint

You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
@@ -349,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
  }'
```

### Batch Scraping Multiple URLs (New)

You can now batch scrape multiple URLs at the same time. It is very similar to how the /crawl endpoint works. It submits a batch scrape job and returns a job ID to check the status of the batch scrape.

```bash
curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
    "formats" : ["markdown", "html"]
  }'
```
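
The returned job ID can then be polled through the batch scrape status route that this commit also registers. A minimal sketch (the job ID placeholder is illustrative; the response follows the same shape as the crawl status response, since the same controller serves both):

```bash
curl -X GET https://api.firecrawl.dev/v1/batch/scrape/YOUR_BATCH_JOB_ID \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY'
```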

### Search (v0) (Beta)

@@ -482,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
  scrapeOptions: {
    formats: ['markdown', 'html'],
  }
} as CrawlParams, true, 30) as CrawlStatusResponse;
} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;

if (crawlResponse) {
  console.log(crawlResponse)
@@ -541,6 +556,12 @@ We love contributions! Please read our [contributing guide](CONTRIBUTING.md) bef

_It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions._

## Contributors

<a href="https://github.com/mendableai/firecrawl/graphs/contributors">
  <img alt="contributors" src="https://contrib.rocks/image?repo=mendableai/firecrawl"/>
</a>

## License Disclaimer

This project is primarily licensed under the GNU Affero General Public License v3.0 (AGPL-3.0), as specified in the LICENSE file in the root directory of this repository. However, certain components of this project are licensed under the MIT License. Refer to the LICENSE files in these specific directories for details.
@@ -552,3 +573,10 @@ Please note:
- When using or contributing to this project, ensure you comply with the appropriate license terms for the specific component you are working with.

For more details on the licensing of specific components, please refer to the LICENSE files in the respective directories or contact the project maintainers.

<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
  <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
    ↑ Back to Top ↑
  </a>
</p>
SELF_HOST.md (10 changes)
@@ -36,7 +36,7 @@ Self-hosting Firecrawl is ideal for those who need full control over their scrap

Create an `.env` in the root directory you can copy over the template in `apps/api/.env.example`
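
For example, from the repository root (a minimal sketch of that copy step):

```bash
cp apps/api/.env.example .env
```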

To start, we wont set up authentication, or any optional sub services (pdf parsing, JS blocking support, AI features)
To start, we won't set up authentication or any optional subservices (pdf parsing, JS blocking support, AI features)

`.env:`
```
@@ -47,7 +47,7 @@ HOST=0.0.0.0
REDIS_URL=redis://redis:6379
REDIS_RATE_LIMIT_URL=redis://redis:6379

## To turn on DB authentication, you need to set up supabase.
## To turn on DB authentication, you need to set up Supabase.
USE_DB_AUTHENTICATION=false

# ===== Optional ENVS ======
@@ -59,8 +59,8 @@ SUPABASE_SERVICE_TOKEN=

# Other Optionals
TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking
OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.)
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
@@ -176,4 +176,4 @@ By addressing these common issues, you can ensure a smoother setup and operation

## Install Firecrawl on a Kubernetes Cluster (Simple Version)

Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
Read the [examples/kubernetes/cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes/cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
@@ -1,5 +1,5 @@
# ===== Required ENVS ======
NUM_WORKERS_PER_QUEUE=8
NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true

# ===== Optional ENVS ======

# SearchApi key. Head to https://searchapi.com/ to get your API key
SEARCHAPI_API_KEY=
# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
SEARCHAPI_ENGINE=

# Supabase Setup (used to support DB authentication, advanced logging, etc.)
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=

# Other Optionals
@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
BULL_AUTH_KEY=
LOGTAIL_KEY=
PLAYWRIGHT_MICROSERVICE_URL=

SEARCHAPI_API_KEY=
@@ -121,6 +121,49 @@ describe("E2E Tests for v1 API Routes", () => {
      },
      30000
    ); // 30 seconds timeout

    it.concurrent(
      "should return a successful response with a valid API key",
      async () => {
        const scrapeRequest: ScrapeRequest = {
          url: "https://arxiv.org/abs/2410.04840",
        };

        const response: ScrapeResponseRequestTest = await request(TEST_URL)
          .post("/v1/scrape")
          .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
          .set("Content-Type", "application/json")
          .send(scrapeRequest);

        expect(response.statusCode).toBe(200);

        if (!("data" in response.body)) {
          throw new Error("Expected response body to have 'data' property");
        }
        expect(response.body.data).not.toHaveProperty("content");
        expect(response.body.data).toHaveProperty("markdown");
        expect(response.body.data).toHaveProperty("metadata");
        expect(response.body.data).not.toHaveProperty("html");
        expect(response.body.data.markdown).toContain("Strong Model Collapse");
        expect(response.body.data.metadata.error).toBeUndefined();
        expect(response.body.data.metadata.description).toContain("Abstract page for arXiv paper 2410.04840: Strong Model Collapse");
        expect(response.body.data.metadata.citation_title).toBe("Strong Model Collapse");
        expect(response.body.data.metadata.citation_author).toEqual([
          "Dohmatob, Elvis",
          "Feng, Yunzhen",
          "Subramonian, Arjun",
          "Kempe, Julia"
        ]);
        expect(response.body.data.metadata.citation_date).toBe("2024/10/07");
        expect(response.body.data.metadata.citation_online_date).toBe("2024/10/08");
        expect(response.body.data.metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840");
        expect(response.body.data.metadata.citation_arxiv_id).toBe("2410.04840");
        expect(response.body.data.metadata.citation_abstract).toContain("Within the scaling laws paradigm");
        expect(response.body.data.metadata.sourceURL).toBe("https://arxiv.org/abs/2410.04840");
        expect(response.body.data.metadata.statusCode).toBe(200);
      },
      30000
    );
    it.concurrent(
      "should return a successful response with a valid API key and includeHtml set to true",
      async () => {
@@ -13,7 +13,7 @@ import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger";
import { redlock } from "../services/redlock";
import { getValue } from "../services/redis";
import { deleteKey, getValue } from "../services/redis";
import { setValue } from "../services/redis";
import { validate } from "uuid";
import * as Sentry from "@sentry/node";
@@ -37,12 +37,17 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
  return validate(potentialUuid);
}

export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)) {
export async function setCachedACUC(
  api_key: string,
  acuc:
    | AuthCreditUsageChunk
    | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk)
) {
  const cacheKeyACUC = `acuc_${api_key}`;
  const redLockKey = `lock_${cacheKeyACUC}`;

  try {
    await redlock.using([redLockKey], 10000, {}, async signal => {
    await redlock.using([redLockKey], 10000, {}, async (signal) => {
      if (typeof acuc === "function") {
        acuc = acuc(JSON.parse(await getValue(cacheKeyACUC)));
@@ -68,31 +73,60 @@ export async function setCachedACUC(api_key: string, acuc: AuthCreditUsageChunk
  }
}

export async function getACUC(api_key: string, cacheOnly = false): Promise<AuthCreditUsageChunk | null> {
export async function getACUC(
  api_key: string,
  cacheOnly = false,
  useCache = true
): Promise<AuthCreditUsageChunk | null> {
  const cacheKeyACUC = `acuc_${api_key}`;

  const cachedACUC = await getValue(cacheKeyACUC);
  if (useCache) {
    const cachedACUC = await getValue(cacheKeyACUC);
    if (cachedACUC !== null) {
      return JSON.parse(cachedACUC);
    }
  }

  if (cachedACUC !== null) {
    return JSON.parse(cachedACUC);
  } else if (!cacheOnly) {
    const { data, error } =
      await supabase_service.rpc("auth_credit_usage_chunk", { input_key: api_key });

    if (error) {
      throw new Error("Failed to retrieve authentication and credit usage data: " + JSON.stringify(error));
  if (!cacheOnly) {
    let data;
    let error;
    let retries = 0;
    const maxRetries = 5;

    while (retries < maxRetries) {
      ({ data, error } = await supabase_service.rpc(
        "auth_credit_usage_chunk_test_21_credit_pack",
        { input_key: api_key }
      ));

      if (!error) {
        break;
      }

      Logger.warn(
        `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
      );
      retries++;
      if (retries === maxRetries) {
        throw new Error(
          "Failed to retrieve authentication and credit usage data after 3 attempts: " +
            JSON.stringify(error)
        );
      }

      // Wait for a short time before retrying
      await new Promise((resolve) => setTimeout(resolve, 200));
    }

    const chunk: AuthCreditUsageChunk | null = data.length === 0
      ? null
      : data[0].team_id === null
        ? null
        : data[0];
    const chunk: AuthCreditUsageChunk | null =
      data.length === 0 ? null : data[0].team_id === null ? null : data[0];

    // NOTE: Should we cache null chunks? - mogery
    if (chunk !== null) {
    if (chunk !== null && useCache) {
      setCachedACUC(api_key, chunk);
    }

    // console.log(chunk);

    return chunk;
  } else {
@@ -100,6 +134,13 @@ export async function getACUC(api_key: string, cacheOnly = false): Promise<AuthC
  }
}

export async function clearACUC(
  api_key: string,
): Promise<void> {
  const cacheKeyACUC = `acuc_${api_key}`;
  await deleteKey(cacheKeyACUC);
}

export async function authenticateUser(
  req,
  res,
@@ -132,7 +173,11 @@ export async function supaAuthenticateUser(
  plan?: PlanType;
  chunk?: AuthCreditUsageChunk;
}> {
  const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] ? `Bearer ${req.headers["sec-websocket-protocol"]}` : null);
  const authHeader =
    req.headers.authorization ??
    (req.headers["sec-websocket-protocol"]
      ? `Bearer ${req.headers["sec-websocket-protocol"]}`
      : null);
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
  }
@@ -162,7 +207,7 @@ export async function supaAuthenticateUser(
        rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
      } else {
        rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
      }
    }
    teamId = "preview";
  } else {
    normalizedApi = parseApi(token);
apps/api/src/controllers/v0/admin/acuc-cache-clear.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth";
import { Logger } from "../../../lib/logger";

export async function acucCacheClearController(req: Request, res: Response) {
  try {
    const team_id: string = req.body.team_id;

    const keys = await supabase_service
      .from("api_keys")
      .select("*")
      .eq("team_id", team_id);

    await Promise.all(keys.data.map((x) => clearACUC(x.key)));

    res.json({ ok: true });
  } catch (error) {
    Logger.error(`Error clearing ACUC cache via API route: ${error}`);
    res.status(500).json({ error: "Internal server error" });
  }
}
@@ -60,7 +60,7 @@ export async function crawlStatusController(req: Request, res: Response) {
    }));

    // Filter out failed jobs
    jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed");
    jobsWithStatuses = jobsWithStatuses.filter(x => x.status !== "failed" && x.status !== "unknown");

    // Sort jobs by timestamp
    jobsWithStatuses.sort((a, b) => a.job.timestamp - b.job.timestamp);
apps/api/src/controllers/v1/batch-scrape.ts (new file, 103 lines)
@@ -0,0 +1,103 @@
|
|||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
BatchScrapeRequest,
|
||||
batchScrapeRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyExtractorOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJobs,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function batchScrapeController(
|
||||
req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
|
||||
res: Response<CrawlResponse>
|
||||
) {
|
||||
req.body = batchScrapeRequestSchema.parse(req.body);
|
||||
|
||||
const id = uuidv4();
|
||||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
let { remainingCredits } = req.account;
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
if(!useDbAuthentication){
|
||||
remainingCredits = Infinity;
|
||||
}
|
||||
|
||||
const pageOptions = legacyScrapeOptions(req.body);
|
||||
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
|
||||
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
crawlerOptions: null,
|
||||
pageOptions,
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
};
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
let jobPriority = 20;
|
||||
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if(req.body.urls.length > 1000){
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
|
||||
}
|
||||
|
||||
const jobs = req.body.urls.map((x) => {
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url: x,
|
||||
mode: "single_urls",
|
||||
team_id: req.auth.team_id,
|
||||
plan: req.auth.plan,
|
||||
crawlerOptions: null,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
v1: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
jobs.map((x) => x.data.url)
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId)
|
||||
);
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
|
||||
const protocol = process.env.ENV === "local" ? req.protocol : "https";
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${protocol}://${req.get("host")}/v1/batch/scrape/${id}`,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -97,12 +97,23 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
|
|||
let jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
let jobStatuses = await Promise.all(jobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)] as const));
|
||||
const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
|
||||
jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed
|
||||
// filter out failed jobs
|
||||
jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed"));
|
||||
// filter the job statues
|
||||
jobStatuses = jobStatuses.filter(x => x[1] !== "failed");
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
|
||||
|
||||
const throttledJobsSet = new Set(throttledJobs);
|
||||
|
||||
const validJobStatuses = [];
|
||||
const validJobIDs = [];
|
||||
|
||||
for (const [id, status] of jobStatuses) {
|
||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
||||
validJobStatuses.push([id, status]);
|
||||
validJobIDs.push(id);
|
||||
}
|
||||
}
|
||||
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
|
||||
|
||||
jobIDs = validJobIDs; // Use validJobIDs instead of jobIDs for further processing
|
||||
|
||||
const doneJobs = await getJobs(doneJobIDs);
|
||||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ export async function getJobs(ids: string[]) {
|
|||
return jobs;
|
||||
}
|
||||
|
||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
|
||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>, isBatch = false) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ success: false, error: "Job not found" });
|
||||
|
@ -60,12 +60,24 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
let jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
let jobStatuses = await Promise.all(jobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)] as const));
|
||||
const throttledJobs = new Set(...await getThrottledJobs(req.auth.team_id));
|
||||
jobStatuses = jobStatuses.filter(x => !throttledJobs.has(x[0])); // throttled jobs can have a failed status, but they are not actually failed
|
||||
// filter out failed jobs
|
||||
jobIDs = jobIDs.filter(id => !jobStatuses.some(status => status[0] === id && status[1] === "failed"));
|
||||
// filter the job statues
|
||||
jobStatuses = jobStatuses.filter(x => x[1] !== "failed");
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
|
||||
|
||||
const throttledJobsSet = new Set(throttledJobs);
|
||||
|
||||
const validJobStatuses = [];
|
||||
const validJobIDs = [];
|
||||
|
||||
for (const [id, status] of jobStatuses) {
|
||||
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
|
||||
validJobStatuses.push([id, status]);
|
||||
validJobIDs.push(id);
|
||||
}
|
||||
}
|
||||
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : validJobStatuses.every(x => x[1] === "completed") ? "completed" : "scraping";
|
||||
|
||||
// Use validJobIDs instead of jobIDs for further processing
|
||||
jobIDs = validJobIDs;
|
||||
|
||||
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
||||
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
||||
|
||||
|
@ -100,7 +112,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
|
|||
|
||||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
|
||||
const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/${isBatch ? "batch/scrape" : "crawl"}/${req.params.jobId}`);
|
||||
|
||||
nextURL.searchParams.set("skip", (start + data.length).toString());
|
||||
|
||||
|
|
|
@ -78,7 +78,7 @@ export async function crawlController(
|
|||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification);
|
||||
} catch (e) {
|
||||
Logger.debug(
|
||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||
|
|
|
@ -63,7 +63,7 @@ export async function mapController(
|
|||
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
|
||||
|
||||
const cacheKey = `fireEngineMap:${mapUrl}`;
|
||||
const cachedResult = await redis.get(cacheKey);
|
||||
const cachedResult = null;
|
||||
|
||||
let allResults: any[];
|
||||
let pagePromises: Promise<any>[];
|
||||
|
|
|
@ -139,7 +139,7 @@ export async function scrapeController(
|
|||
crawlerOptions: {},
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
|||
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
|
||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||
import { PlanType } from "../../types";
|
||||
import { countries } from "../../lib/validate-country";
|
||||
|
||||
export type Format =
|
||||
| "markdown"
|
||||
|
@ -108,6 +109,28 @@ export const scrapeOptions = z.object({
|
|||
extract: extractOptions.optional(),
|
||||
parsePDF: z.boolean().default(true),
|
||||
actions: actionsSchema.optional(),
|
||||
// New
|
||||
location: z.object({
|
||||
country: z.string().optional().refine(
|
||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
||||
{
|
||||
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
||||
}
|
||||
).transform(val => val ? val.toUpperCase() : 'US'),
|
||||
languages: z.string().array().optional(),
|
||||
}).optional(),
|
||||
|
||||
// Deprecated
|
||||
geolocation: z.object({
|
||||
country: z.string().optional().refine(
|
||||
(val) => !val || Object.keys(countries).includes(val.toUpperCase()),
|
||||
{
|
||||
message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
|
||||
}
|
||||
).transform(val => val ? val.toUpperCase() : 'US'),
|
||||
languages: z.string().array().optional(),
|
||||
}).optional(),
|
||||
skipTlsVerification: z.boolean().default(false),
|
||||
}).strict(strictMessage)
|
||||
|
||||
|
||||
|
@ -132,19 +155,29 @@ export const scrapeRequestSchema = scrapeOptions.extend({
|
|||
return obj;
|
||||
});
|
||||
|
||||
// export type ScrapeRequest = {
|
||||
// url: string;
|
||||
// formats?: Format[];
|
||||
// headers?: { [K: string]: string };
|
||||
// includeTags?: string[];
|
||||
// excludeTags?: string[];
|
||||
// onlyMainContent?: boolean;
|
||||
// timeout?: number;
|
||||
// waitFor?: number;
|
||||
// }
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
|
||||
export const batchScrapeRequestSchema = scrapeOptions.extend({
|
||||
urls: url.array(),
|
||||
origin: z.string().optional().default("api"),
|
||||
}).strict(strictMessage).refine(
|
||||
(obj) => {
|
||||
const hasExtractFormat = obj.formats?.includes("extract");
|
||||
const hasExtractOptions = obj.extract !== undefined;
|
||||
return (hasExtractFormat && hasExtractOptions) || (!hasExtractFormat && !hasExtractOptions);
|
||||
},
|
||||
{
|
||||
message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
|
||||
}
|
||||
).transform((obj) => {
|
||||
if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
|
||||
return { ...obj, timeout: 60000 };
|
||||
}
|
||||
return obj;
|
||||
});
|
||||
|
||||
export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
|
||||
|
||||
const crawlerOptions = z.object({
|
||||
includePaths: z.string().array().default([]),
|
||||
excludePaths: z.string().array().default([]),
|
||||
|
@ -250,6 +283,8 @@ export type Document = {
|
|||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
[key: string]: string | string[] | number | undefined;
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -340,6 +375,8 @@ export type AuthCreditUsageChunk = {
|
|||
coupons: any[];
|
||||
adjusted_credits_used: number; // credits this period minus coupons used
|
||||
remaining_credits: number;
|
||||
sub_user_id: string | null;
|
||||
total_credits_sum: number;
|
||||
};
|
||||
|
||||
export interface RequestWithMaybeACUC<
|
||||
|
@ -421,6 +458,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
|||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
parsePDF: x.parsePDF,
|
||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||
geolocation: x.location ?? x.geolocation,
|
||||
skipTlsVerification: x.skipTlsVerification
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
|||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||
import { ZodError } from "zod";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import dns from 'node:dns';
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
|
@ -28,13 +29,13 @@ const { ExpressAdapter } = require("@bull-board/express");
|
|||
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
|
||||
Logger.info(`Number of CPUs: ${numCPUs} available`);
|
||||
|
||||
const cacheable = new CacheableLookup({
|
||||
// this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme
|
||||
lookup:false
|
||||
});
|
||||
const cacheable = new CacheableLookup()
|
||||
|
||||
|
||||
// Install cacheable lookup for all other requests
|
||||
cacheable.install(http.globalAgent);
|
||||
cacheable.install(https.globalAgent)
|
||||
cacheable.install(https.globalAgent);
|
||||
|
||||
|
||||
const ws = expressWs(express());
|
||||
const app = ws.app;
|
||||
|
|
|
@ -6,7 +6,13 @@ export function numTokensFromString(message: string, model: string): number {
|
|||
const encoder = encoding_for_model(model as TiktokenModel);
|
||||
|
||||
// Encode the message into tokens
|
||||
const tokens = encoder.encode(message);
|
||||
let tokens: Uint32Array;
|
||||
try {
|
||||
tokens = encoder.encode(message);
|
||||
} catch (error) {
|
||||
message = message.replace("<|endoftext|>", "");
|
||||
tokens = encoder.encode(message);
|
||||
}
|
||||
|
||||
// Free the encoder resources after use
|
||||
encoder.free();
|
||||
|
|
|
@ -3,7 +3,7 @@ import { redisConnection } from "../services/queue-service";
|
|||
import { Logger } from "./logger";
|
||||
|
||||
export type StoredCrawl = {
|
||||
originUrl: string;
|
||||
originUrl?: string;
|
||||
crawlerOptions: any;
|
||||
pageOptions: any;
|
||||
team_id: string;
|
||||
|
|
|
@ -51,6 +51,10 @@ export type PageOptions = {
|
|||
disableJsDom?: boolean; // beta
|
||||
atsv?: boolean; // anti-bot solver, beta
|
||||
actions?: Action[]; // beta
|
||||
geolocation?: {
|
||||
country?: string;
|
||||
};
|
||||
skipTlsVerification?: boolean;
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
|
|
apps/api/src/lib/validate-country.ts (new file, 2261 lines)
File diff suppressed because it is too large.
|
@ -112,7 +112,7 @@ export async function runWebScraper({
|
|||
}
|
||||
|
||||
// remove docs with empty content
|
||||
const filteredDocs = crawlerOptions.returnOnlyUrls
|
||||
const filteredDocs = crawlerOptions?.returnOnlyUrls
|
||||
? docs.map((doc) => {
|
||||
if (doc.metadata.sourceURL) {
|
||||
return { url: doc.metadata.sourceURL };
|
||||
|
@ -121,8 +121,13 @@ export async function runWebScraper({
|
|||
: docs;
|
||||
|
||||
if(is_scrape === false) {
|
||||
billTeam(team_id, undefined, filteredDocs.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
|
||||
creditsToBeBilled = 5;
|
||||
}
|
||||
|
||||
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
|
||||
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
|
||||
// Optionally, you could notify an admin or add to a retry queue here
|
||||
});
|
||||
}
|
||||
|
|
|
@@ -6,6 +6,8 @@ import {
  cleanBefore24hCompleteJobsController,
  queuesController,
} from "../controllers/v0/admin/queue";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { wrap } from "./v1";

export const adminRouter = express.Router();

@@ -33,3 +35,8 @@ adminRouter.get(
  `/admin/${process.env.BULL_AUTH_KEY}/autoscaler`,
  autoscalerController
);

adminRouter.post(
  `/admin/${process.env.BULL_AUTH_KEY}/acuc-cache-clear`,
  wrap(acucCacheClearController),
);
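Once deployed, this new admin route can be exercised with a plain HTTP call. A sketch only: the base URL is borrowed from the check-queues workflow above, the key placeholder stands in for BULL_AUTH_KEY, and the body carries the team_id field that acucCacheClearController reads:

curl -X POST https://api.firecrawl.dev/admin/YOUR_BULL_AUTH_KEY/acuc-cache-clear \
  -H 'Content-Type: application/json' \
  -d '{ "team_id": "YOUR_TEAM_ID" }'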
|
|
|
@ -17,6 +17,7 @@ import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
|||
import { Logger } from "../lib/logger";
|
||||
import { scrapeStatusController } from "../controllers/v1/scrape-status";
|
||||
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
|
||||
import { batchScrapeController } from "../controllers/v1/batch-scrape";
|
||||
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
||||
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
||||
// import { searchController } from "../../src/controllers/v1/search";
|
||||
|
@ -29,14 +30,14 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
|
|||
return (req, res, next) => {
|
||||
(async () => {
|
||||
if (!minimum && req.body) {
|
||||
minimum = (req.body as any)?.limit ?? 1;
|
||||
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
|
||||
}
|
||||
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum);
|
||||
req.acuc = chunk;
|
||||
if (!success) {
|
||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
if (!res.headersSent) {
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing." });
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
|
||||
}
|
||||
}
|
||||
req.account = { remainingCredits };
|
||||
|
@ -94,7 +95,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
|
|||
next();
|
||||
}
|
||||
|
||||
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
|
||||
export function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
|
||||
return (req, res, next) => {
|
||||
controller(req, res)
|
||||
.catch(err => next(err))
|
||||
|
@ -122,6 +123,15 @@ v1Router.post(
|
|||
wrap(crawlController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/batch/scrape",
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
checkCreditsMiddleware(),
|
||||
blocklistMiddleware,
|
||||
idempotencyMiddleware,
|
||||
wrap(batchScrapeController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/map",
|
||||
authMiddleware(RateLimiterMode.Map),
|
||||
|
@ -136,6 +146,13 @@ v1Router.get(
|
|||
wrap(crawlStatusController)
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/batch/scrape/:jobId",
|
||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||
// Yes, it uses the same controller as the normal crawl status controller
|
||||
wrap((req:any, res):any => crawlStatusController(req, res, true))
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/scrape/:jobId",
|
||||
wrap(scrapeStatusController)
|
||||
|
|
|
@ -9,7 +9,7 @@ import robotsParser from "robots-parser";
|
|||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
import https from "https";
|
||||
export class WebCrawler {
|
||||
private jobId: string;
|
||||
private initialUrl: string;
|
||||
|
@ -136,13 +136,23 @@ export class WebCrawler {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (this.isFile(link)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
})
|
||||
.slice(0, limit);
|
||||
}
|
||||
|
||||
public async getRobotsTxt(): Promise<string> {
|
||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
|
||||
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
|
||||
let extraArgs = {};
|
||||
if(skipTlsVerification) {
|
||||
extraArgs["httpsAgent"] = new https.Agent({
|
||||
rejectUnauthorized: false
|
||||
});
|
||||
}
|
||||
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
|
||||
return response.data;
|
||||
}
|
||||
|
||||
|
@ -478,7 +488,14 @@ export class WebCrawler {
|
|||
".webp",
|
||||
".inc"
|
||||
];
|
||||
return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext));
|
||||
|
||||
try {
|
||||
const urlWithoutQuery = url.split('?')[0].toLowerCase();
|
||||
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
|
||||
} catch (error) {
|
||||
Logger.error(`Error processing URL in isFile: ${error}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private isSocialMediaOrEmail(url: string): boolean {
|
||||
|
|
|
@ -593,6 +593,8 @@ export class WebScraperDataProvider {
|
|||
disableJsDom: options.pageOptions?.disableJsDom ?? false,
|
||||
atsv: options.pageOptions?.atsv ?? false,
|
||||
actions: options.pageOptions?.actions ?? undefined,
|
||||
geolocation: options.pageOptions?.geolocation ?? undefined,
|
||||
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||
this.replaceAllPathsWithAbsolutePaths =
|
||||
|
|
|
@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
|
|||
waitFor = 0,
|
||||
screenshot = false,
|
||||
fullPageScreenshot = false,
|
||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false },
|
||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
|
||||
fireEngineOptions = {},
|
||||
headers,
|
||||
options,
|
||||
|
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
|
|||
waitFor?: number;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean };
|
||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
|
||||
fireEngineOptions?: FireEngineOptions;
|
||||
headers?: Record<string, string>;
|
||||
options?: any;
|
||||
|
@ -118,6 +118,8 @@ export async function scrapWithFireEngine({
|
|||
...fireEngineOptionsParam,
|
||||
atsv: pageOptions?.atsv ?? false,
|
||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||
geolocation: pageOptions?.geolocation,
|
||||
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
|
||||
actions: actions,
|
||||
},
|
||||
{
|
||||
|
|
|
@ -156,6 +156,8 @@ export async function scrapSingleUrl(
|
|||
disableJsDom: pageOptions.disableJsDom ?? false,
|
||||
atsv: pageOptions.atsv ?? false,
|
||||
actions: pageOptions.actions ?? undefined,
|
||||
geolocation: pageOptions.geolocation ?? undefined,
|
||||
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
|
||||
}
|
||||
|
||||
if (extractorOptions) {
|
||||
|
@ -207,14 +209,15 @@ export async function scrapSingleUrl(
|
|||
if (action.type === "click" || action.type === "write" || action.type === "press") {
|
||||
const result: Action[] = [];
|
||||
// Don't add a wait if the previous action is a wait
|
||||
if (index === 0 || array[index - 1].type !== "wait") {
|
||||
result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
}
|
||||
// if (index === 0 || array[index - 1].type !== "wait") {
|
||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
// }
|
||||
// Fire-engine now handles wait times automatically, leaving the code here for now
|
||||
result.push(action);
|
||||
// Don't add a wait if the next action is a wait
|
||||
if (index === array.length - 1 || array[index + 1].type !== "wait") {
|
||||
result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
}
|
||||
// if (index === array.length - 1 || array[index + 1].type !== "wait") {
|
||||
// result.push({ type: "wait", milliseconds: 1200 } as Action);
|
||||
// }
|
||||
return result;
|
||||
}
|
||||
return [action as Action];
|
||||
|
|
|
@ -3,10 +3,8 @@ export const excludeNonMainTags = [
|
|||
"footer",
|
||||
"nav",
|
||||
"aside",
|
||||
".header",
|
||||
".top",
|
||||
".navbar",
|
||||
"#header",
|
||||
".footer",
|
||||
".bottom",
|
||||
"#footer",
|
||||
|
@ -39,8 +37,6 @@ export const excludeNonMainTags = [
|
|||
"#search",
|
||||
".share",
|
||||
"#share",
|
||||
".widget",
|
||||
"#widget",
|
||||
".cookie",
|
||||
"#cookie"
|
||||
];
|
||||
|
|
|
@ -34,6 +34,7 @@ interface Metadata {
|
|||
sourceURL?: string;
|
||||
pageStatusCode?: number;
|
||||
pageError?: string;
|
||||
[key: string]: string | string[] | number | undefined;
|
||||
}
|
||||
|
||||
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||
|
@ -70,40 +71,78 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
|||
let pageStatusCode: number | null = null;
|
||||
let pageError: string | null = null;
|
||||
|
||||
const customMetadata: Record<string, string | string[]> = {};
|
||||
|
||||
try {
|
||||
// TODO: remove this as it is redundant with the below implementation
|
||||
title = soup("title").text() || null;
|
||||
description = soup('meta[name="description"]').attr("content") || null;
|
||||
|
||||
// Assuming the language is part of the URL as per the regex pattern
|
||||
language = soup('html').attr('lang') || null;
|
||||
|
||||
language = soup("html").attr("lang") || null;
|
||||
|
||||
keywords = soup('meta[name="keywords"]').attr("content") || null;
|
||||
robots = soup('meta[name="robots"]').attr("content") || null;
|
||||
ogTitle = soup('meta[property="og:title"]').attr("content") || null;
|
||||
ogDescription = soup('meta[property="og:description"]').attr("content") || null;
|
||||
ogDescription =
|
||||
soup('meta[property="og:description"]').attr("content") || null;
|
||||
ogUrl = soup('meta[property="og:url"]').attr("content") || null;
|
||||
ogImage = soup('meta[property="og:image"]').attr("content") || null;
|
||||
ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
|
||||
ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null;
|
||||
ogDeterminer =
|
||||
soup('meta[property="og:determiner"]').attr("content") || null;
|
||||
ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
|
||||
ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null;
|
||||
ogLocaleAlternate =
|
||||
soup('meta[property="og:locale:alternate"]')
|
||||
.map((i, el) => soup(el).attr("content"))
|
||||
.get() || null;
|
||||
ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
|
||||
ogVideo = soup('meta[property="og:video"]').attr("content") || null;
|
||||
articleSection = soup('meta[name="article:section"]').attr("content") || null;
|
||||
articleSection =
|
||||
soup('meta[name="article:section"]').attr("content") || null;
|
||||
articleTag = soup('meta[name="article:tag"]').attr("content") || null;
|
||||
publishedTime = soup('meta[property="article:published_time"]').attr("content") || null;
|
||||
modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null;
|
||||
dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null;
|
||||
publishedTime =
|
||||
soup('meta[property="article:published_time"]').attr("content") || null;
|
||||
modifiedTime =
|
||||
soup('meta[property="article:modified_time"]').attr("content") || null;
|
||||
dctermsKeywords =
|
||||
soup('meta[name="dcterms.keywords"]').attr("content") || null;
|
||||
dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
|
||||
dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
|
||||
dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null;
|
||||
dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null;
|
||||
dctermsSubject =
|
||||
soup('meta[name="dcterms.subject"]').attr("content") || null;
|
||||
dctermsAudience =
|
||||
soup('meta[name="dcterms.audience"]').attr("content") || null;
|
||||
dcType = soup('meta[name="dc.type"]').attr("content") || null;
|
||||
dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
|
||||
dcDate = soup('meta[name="dc.date"]').attr("content") || null;
|
||||
dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null;
|
||||
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
|
||||
dcDateCreated =
|
||||
soup('meta[name="dc.date.created"]').attr("content") || null;
|
||||
dctermsCreated =
|
||||
soup('meta[name="dcterms.created"]').attr("content") || null;
|
||||
|
||||
try {
|
||||
// Extract all meta tags for custom metadata
|
||||
soup("meta").each((i, elem) => {
|
||||
try {
|
||||
const name = soup(elem).attr("name") || soup(elem).attr("property");
|
||||
const content = soup(elem).attr("content");
|
||||
|
||||
if (name && content) {
|
||||
if (customMetadata[name] === undefined) {
|
||||
customMetadata[name] = content;
|
||||
} else if (Array.isArray(customMetadata[name])) {
|
||||
(customMetadata[name] as string[]).push(content);
|
||||
} else {
|
||||
customMetadata[name] = [customMetadata[name] as string, content];
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error extracting custom metadata (in): ${error}`);
|
||||
}
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Error extracting custom metadata: ${error}`);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error extracting metadata: ${error}`);
|
||||
}
|
||||
|
@ -141,5 +180,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
|||
...(sourceURL ? { sourceURL } : {}),
|
||||
...(pageStatusCode ? { pageStatusCode } : {}),
|
||||
...(pageError ? { pageError } : {}),
|
||||
...customMetadata,
|
||||
};
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import axios, { AxiosResponse } from "axios";
|
||||
import fs from "fs";
|
||||
import fs from "fs/promises";
|
||||
import { createReadStream, createWriteStream } from "node:fs";
|
||||
import FormData from "form-data";
|
||||
import dotenv from "dotenv";
|
||||
|
@ -15,7 +15,7 @@ export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promis
|
|||
try {
|
||||
const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url);
|
||||
const content = await processPdfToText(tempFilePath, parsePDF);
|
||||
fs.unlinkSync(tempFilePath); // Clean up the temporary file
|
||||
await fs.unlink(tempFilePath); // Clean up the temporary file
|
||||
return { content, pageStatusCode, pageError };
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to fetch and process PDF: ${error.message}`);
|
||||
|
@ -120,7 +120,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
|||
}
|
||||
} else {
|
||||
try {
|
||||
content = fs.readFileSync(filePath, "utf-8");
|
||||
content = await fs.readFile(filePath, "utf-8");
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to read PDF file: ${error}`);
|
||||
content = "";
|
||||
|
@ -131,7 +131,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
|
|||
|
||||
async function processPdf(file: string) {
|
||||
try {
|
||||
const fileContent = fs.readFileSync(file);
|
||||
const fileContent = await fs.readFile(file);
|
||||
const data = await pdf(fileContent);
|
||||
return data.text;
|
||||
} catch (error) {
|
||||
|
|
|
@ -6,6 +6,7 @@ import { Logger } from "../lib/logger";
|
|||
|
||||
dotenv.config();
|
||||
|
||||
|
||||
export async function fireEngineMap(
|
||||
q: string,
|
||||
options: {
|
||||
|
@ -41,11 +42,12 @@ export async function fireEngineMap(
|
|||
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"X-Disable-Cache": "true"
|
||||
},
|
||||
data: data,
|
||||
};
|
||||
const response = await axios(config);
|
||||
if (response && response) {
|
||||
if (response && response.data) {
|
||||
return response.data;
|
||||
} else {
|
||||
return [];
|
||||
|
|
|
@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
|
|||
import { SearchResult } from "../../src/lib/entities";
|
||||
import { googleSearch } from "./googlesearch";
|
||||
import { fireEngineMap } from "./fireEngine";
|
||||
import { searchapi_search } from "./searchapi";
|
||||
import { serper_search } from "./serper";
|
||||
|
||||
export async function search({
|
||||
|
@ -30,7 +31,16 @@ export async function search({
|
|||
timeout?: number;
|
||||
}): Promise<SearchResult[]> {
|
||||
try {
|
||||
|
||||
if (process.env.SEARCHAPI_API_KEY) {
|
||||
return await searchapi_search(query, {
|
||||
num_results,
|
||||
tbs,
|
||||
filter,
|
||||
lang,
|
||||
country,
|
||||
location
|
||||
});
|
||||
}
|
||||
if (process.env.SERPER_API_KEY) {
|
||||
return await serper_search(query, {
|
||||
num_results,
|
||||
|
|
apps/api/src/search/searchapi.ts (new file, 60 lines)
@@ -0,0 +1,60 @@
|
|||
import axios from "axios";
|
||||
import dotenv from "dotenv";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
interface SearchOptions {
|
||||
tbs?: string;
|
||||
filter?: string;
|
||||
lang?: string;
|
||||
country?: string;
|
||||
location?: string;
|
||||
num_results: number;
|
||||
page?: number;
|
||||
}
|
||||
|
||||
export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
|
||||
const params = {
|
||||
q: q,
|
||||
hl: options.lang,
|
||||
gl: options.country,
|
||||
location: options.location,
|
||||
num: options.num_results,
|
||||
page: options.page ?? 1,
|
||||
engine: process.env.SEARCHAPI_ENGINE || "google",
|
||||
};
|
||||
|
||||
const url = `https://www.searchapi.io/api/v1/search`;
|
||||
|
||||
try {
|
||||
const response = await axios.get(url, {
|
||||
headers: {
|
||||
"Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
|
||||
"Content-Type": "application/json",
|
||||
"X-SearchApi-Source": "Firecrawl",
|
||||
},
|
||||
params: params,
|
||||
});
|
||||
|
||||
|
||||
if (response.status === 401) {
|
||||
throw new Error("Unauthorized. Please check your API key.");
|
||||
}
|
||||
|
||||
const data = response.data;
|
||||
|
||||
if (data && Array.isArray(data.organic_results)) {
|
||||
return data.organic_results.map((a: any) => ({
|
||||
url: a.link,
|
||||
title: a.title,
|
||||
description: a.snippet,
|
||||
}));
|
||||
} else {
|
||||
return [];
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`There was an error searching for content: ${error.message}`);
|
||||
return [];
|
||||
}
|
||||
}
|
apps/api/src/services/billing/auto_charge.ts (new file, 176 lines)
@@ -0,0 +1,176 @@
|
|||
// Import necessary dependencies and types
|
||||
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
|
||||
import { getACUC, setCachedACUC } from "../../controllers/auth";
|
||||
import { redlock } from "../redlock";
|
||||
import { supabase_service } from "../supabase";
|
||||
import { createPaymentIntent } from "./stripe";
|
||||
import { issueCredits } from "./issue_credits";
|
||||
import { sendNotification } from "../notification/email_notification";
|
||||
import { NotificationType } from "../../types";
|
||||
import { deleteKey, getValue, setValue } from "../redis";
|
||||
import { sendSlackWebhook } from "../alerts/slack";
|
||||
import { Logger } from "../../lib/logger";
|
||||
|
||||
// Define the number of credits to be added during auto-recharge
|
||||
const AUTO_RECHARGE_CREDITS = 1000;
|
||||
const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds
|
||||
|
||||
/**
|
||||
* Attempt to automatically charge a user's account when their credit balance falls below a threshold
|
||||
* @param chunk The user's current usage data
|
||||
* @param autoRechargeThreshold The credit threshold that triggers auto-recharge
|
||||
*/
|
||||
export async function autoCharge(
|
||||
chunk: AuthCreditUsageChunk,
|
||||
autoRechargeThreshold: number
|
||||
): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> {
|
||||
const resource = `auto-recharge:${chunk.team_id}`;
|
||||
const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`;
|
||||
|
||||
try {
|
||||
// Check if the team is in the cooldown period
|
||||
// Another check to prevent race conditions, double charging - cool down of 5 minutes
|
||||
const cooldownValue = await getValue(cooldownKey);
|
||||
if (cooldownValue) {
|
||||
Logger.info(`Auto-recharge for team ${chunk.team_id} is in cooldown period`);
|
||||
return {
|
||||
success: false,
|
||||
message: "Auto-recharge is in cooldown period",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
}
|
||||
|
||||
// Use a distributed lock to prevent concurrent auto-charge attempts
|
||||
return await redlock.using([resource], 5000, async (signal) : Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> => {
|
||||
// Recheck the condition inside the lock to prevent race conditions
|
||||
const updatedChunk = await getACUC(chunk.api_key, false, false);
|
||||
if (
|
||||
updatedChunk &&
|
||||
updatedChunk.remaining_credits < autoRechargeThreshold
|
||||
) {
|
||||
if (chunk.sub_user_id) {
|
||||
// Fetch the customer's Stripe information
|
||||
const { data: customer, error: customersError } =
|
||||
await supabase_service
|
||||
.from("customers")
|
||||
.select("id, stripe_customer_id")
|
||||
.eq("id", chunk.sub_user_id)
|
||||
.single();
|
||||
|
||||
if (customersError) {
|
||||
Logger.error(`Error fetching customer data: ${customersError}`);
|
||||
return {
|
||||
success: false,
|
||||
message: "Error fetching customer data",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
}
|
||||
|
||||
if (customer && customer.stripe_customer_id) {
|
||||
let issueCreditsSuccess = false;
|
||||
// Attempt to create a payment intent
|
||||
const paymentStatus = await createPaymentIntent(
|
||||
chunk.team_id,
|
||||
customer.stripe_customer_id
|
||||
);
|
||||
|
||||
// If payment is successful or requires further action, issue credits
|
||||
if (
|
||||
paymentStatus.return_status === "succeeded" ||
|
||||
paymentStatus.return_status === "requires_action"
|
||||
) {
|
||||
issueCreditsSuccess = await issueCredits(
|
||||
chunk.team_id,
|
||||
AUTO_RECHARGE_CREDITS
|
||||
);
|
||||
}
|
||||
|
||||
// Record the auto-recharge transaction
|
||||
await supabase_service.from("auto_recharge_transactions").insert({
|
||||
team_id: chunk.team_id,
|
||||
initial_payment_status: paymentStatus.return_status,
|
||||
credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0,
|
||||
stripe_charge_id: paymentStatus.charge_id,
|
||||
});
|
||||
|
||||
// Send a notification if credits were successfully issued
|
||||
if (issueCreditsSuccess) {
|
||||
await sendNotification(
|
||||
chunk.team_id,
|
||||
NotificationType.AUTO_RECHARGE_SUCCESS,
|
||||
chunk.sub_current_period_start,
|
||||
chunk.sub_current_period_end,
|
||||
chunk,
|
||||
true
|
||||
);
|
||||
|
||||
// Set cooldown period
|
||||
await setValue(cooldownKey, 'true', AUTO_RECHARGE_COOLDOWN);
|
||||
}
|
||||
|
||||
// Reset ACUC cache to reflect the new credit balance
|
||||
const cacheKeyACUC = `acuc_${chunk.api_key}`;
|
||||
await deleteKey(cacheKeyACUC);
|
||||
|
||||
if (process.env.SLACK_ADMIN_WEBHOOK_URL) {
|
||||
const webhookCooldownKey = `webhook_cooldown_${chunk.team_id}`;
|
||||
const isInCooldown = await getValue(webhookCooldownKey);
|
||||
|
||||
if (!isInCooldown) {
|
||||
sendSlackWebhook(
|
||||
`Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`,
|
||||
false,
|
||||
process.env.SLACK_ADMIN_WEBHOOK_URL
|
||||
).catch((error) => {
|
||||
Logger.debug(`Error sending slack notification: ${error}`);
|
||||
});
|
||||
|
||||
// Set cooldown for 1 hour
|
||||
await setValue(webhookCooldownKey, 'true', 60 * 60);
|
||||
}
|
||||
}
|
||||
return {
|
||||
success: true,
|
||||
message: "Auto-recharge successful",
|
||||
remainingCredits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS,
|
||||
chunk: {...chunk, remaining_credits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS},
|
||||
};
|
||||
} else {
|
||||
Logger.error("No Stripe customer ID found for user");
|
||||
return {
|
||||
success: false,
|
||||
message: "No Stripe customer ID found for user",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
}
|
||||
} else {
|
||||
Logger.error("No sub_user_id found in chunk");
|
||||
return {
|
||||
success: false,
|
||||
message: "No sub_user_id found in chunk",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
}
|
||||
}
|
||||
return {
|
||||
success: false,
|
||||
message: "No need to auto-recharge",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Failed to acquire lock for auto-recharge: ${error}`);
|
||||
return {
|
||||
success: false,
|
||||
message: "Failed to acquire lock for auto-recharge",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
}
|
||||
}
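
The guard used above is worth calling out: a cheap Redis cooldown check, then a short-lived distributed lock, then a re-check of the balance inside the lock. Below is a stripped-down sketch of that pattern, reusing the same `redlock`/Redis helpers from this file; `getRemainingCredits` and `doRecharge` are hypothetical callbacks standing in for the Stripe and coupon logic.

```ts
import { redlock } from "../redlock";
import { getValue, setValue } from "../redis";

// Minimal sketch of the cooldown + lock + re-check guard used by autoCharge above.
async function guardedRecharge(
  teamId: string,
  threshold: number,
  getRemainingCredits: () => Promise<number>, // hypothetical stand-in
  doRecharge: () => Promise<void>             // hypothetical stand-in
): Promise<boolean> {
  const cooldownKey = `auto-recharge-cooldown:${teamId}`;

  // 1. Skip entirely if a recharge already happened within the cooldown window.
  if (await getValue(cooldownKey)) return false;

  // 2. Hold a short distributed lock so only one worker proceeds per team.
  return await redlock.using([`auto-recharge:${teamId}`], 5000, async () => {
    // 3. Re-check the balance inside the lock to avoid double charging.
    if ((await getRemainingCredits()) >= threshold) return false;

    await doRecharge();

    // 4. Arm the cooldown (300 seconds, matching AUTO_RECHARGE_COOLDOWN).
    await setValue(cooldownKey, "true", 300);
    return true;
  });
}
```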
|
|
@ -6,24 +6,40 @@ import { Logger } from "../../lib/logger";
|
|||
import * as Sentry from "@sentry/node";
|
||||
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
|
||||
import { getACUC, setCachedACUC } from "../../controllers/auth";
|
||||
import { issueCredits } from "./issue_credits";
|
||||
import { redlock } from "../redlock";
|
||||
import { autoCharge } from "./auto_charge";
|
||||
import { getValue, setValue } from "../redis";
|
||||
|
||||
const FREE_CREDITS = 500;
|
||||
|
||||
/**
|
||||
* If you do not know the subscription_id in the current context, pass subscription_id as undefined.
|
||||
*/
|
||||
export async function billTeam(team_id: string, subscription_id: string | null | undefined, credits: number) {
|
||||
export async function billTeam(
|
||||
team_id: string,
|
||||
subscription_id: string | null | undefined,
|
||||
credits: number
|
||||
) {
|
||||
return withAuth(supaBillTeam)(team_id, subscription_id, credits);
|
||||
}
|
||||
export async function supaBillTeam(team_id: string, subscription_id: string, credits: number) {
|
||||
export async function supaBillTeam(
|
||||
team_id: string,
|
||||
subscription_id: string,
|
||||
credits: number
|
||||
) {
|
||||
if (team_id === "preview") {
|
||||
return { success: true, message: "Preview team, no credits used" };
|
||||
}
|
||||
Logger.info(`Billing team ${team_id} for ${credits} credits`);
|
||||
|
||||
const { data, error } =
|
||||
await supabase_service.rpc("bill_team", { _team_id: team_id, sub_id: subscription_id ?? null, fetch_subscription: subscription_id === undefined, credits });
|
||||
|
||||
const { data, error } = await supabase_service.rpc("bill_team", {
|
||||
_team_id: team_id,
|
||||
sub_id: subscription_id ?? null,
|
||||
fetch_subscription: subscription_id === undefined,
|
||||
credits,
|
||||
});
|
||||
|
||||
if (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error("Failed to bill team: " + JSON.stringify(error));
|
||||
|
@ -31,53 +47,126 @@ export async function supaBillTeam(team_id: string, subscription_id: string, cre
|
|||
}
|
||||
|
||||
(async () => {
|
||||
for (const apiKey of (data ?? []).map(x => x.api_key)) {
|
||||
await setCachedACUC(apiKey, acuc => (acuc ? {
|
||||
...acuc,
|
||||
credits_used: acuc.credits_used + credits,
|
||||
adjusted_credits_used: acuc.adjusted_credits_used + credits,
|
||||
remaining_credits: acuc.remaining_credits - credits,
|
||||
} : null));
|
||||
for (const apiKey of (data ?? []).map((x) => x.api_key)) {
|
||||
await setCachedACUC(apiKey, (acuc) =>
|
||||
acuc
|
||||
? {
|
||||
...acuc,
|
||||
credits_used: acuc.credits_used + credits,
|
||||
adjusted_credits_used: acuc.adjusted_credits_used + credits,
|
||||
remaining_credits: acuc.remaining_credits - credits,
|
||||
}
|
||||
: null
|
||||
);
|
||||
}
|
||||
})();
|
||||
}
|
||||
|
||||
export async function checkTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) {
|
||||
return withAuth(supaCheckTeamCredits)(chunk, team_id, credits);
|
||||
export async function checkTeamCredits(
|
||||
chunk: AuthCreditUsageChunk,
|
||||
team_id: string,
|
||||
credits: number
|
||||
): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> {
|
||||
const result = await withAuth(supaCheckTeamCredits)(chunk, team_id, credits);
|
||||
return {
|
||||
success: result.success,
|
||||
message: result.message,
|
||||
remainingCredits: result.remainingCredits,
|
||||
chunk: chunk // Ensure chunk is always returned
|
||||
};
|
||||
}
|
||||
|
||||
// if team has enough credits for the operation, return true, else return false
|
||||
export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) {
|
||||
export async function supaCheckTeamCredits(
|
||||
chunk: AuthCreditUsageChunk,
|
||||
team_id: string,
|
||||
credits: number
|
||||
) {
|
||||
// WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery
|
||||
if (team_id === "preview") {
|
||||
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
|
||||
return {
|
||||
success: true,
|
||||
message: "Preview team, no credits used",
|
||||
remainingCredits: Infinity,
|
||||
};
|
||||
}
|
||||
|
||||
const creditsWillBeUsed = chunk.adjusted_credits_used + credits;
|
||||
|
||||
// In case chunk.price_credits is undefined, set it to a large number to avoid mistakes
|
||||
const totalPriceCredits = chunk.total_credits_sum ?? 100000000;
|
||||
// Removal of + credits
|
||||
const creditUsagePercentage = creditsWillBeUsed / chunk.price_credits;
|
||||
const creditUsagePercentage = chunk.adjusted_credits_used / totalPriceCredits;
|
||||
|
||||
let isAutoRechargeEnabled = false, autoRechargeThreshold = 1000;
|
||||
const cacheKey = `team_auto_recharge_${team_id}`;
|
||||
let cachedData = await getValue(cacheKey);
|
||||
if (cachedData) {
|
||||
const parsedData = JSON.parse(cachedData);
|
||||
isAutoRechargeEnabled = parsedData.auto_recharge;
|
||||
autoRechargeThreshold = parsedData.auto_recharge_threshold;
|
||||
} else {
|
||||
const { data, error } = await supabase_service
|
||||
.from("teams")
|
||||
.select("auto_recharge, auto_recharge_threshold")
|
||||
.eq("id", team_id)
|
||||
.single();
|
||||
|
||||
if (data) {
|
||||
isAutoRechargeEnabled = data.auto_recharge;
|
||||
autoRechargeThreshold = data.auto_recharge_threshold;
|
||||
await setValue(cacheKey, JSON.stringify(data), 300); // Cache for 5 minutes (300 seconds)
|
||||
}
|
||||
}
|
||||
|
||||
if (isAutoRechargeEnabled && chunk.remaining_credits < autoRechargeThreshold) {
|
||||
const autoChargeResult = await autoCharge(chunk, autoRechargeThreshold);
|
||||
if (autoChargeResult.success) {
|
||||
return {
|
||||
success: true,
|
||||
message: autoChargeResult.message,
|
||||
remainingCredits: autoChargeResult.remainingCredits,
|
||||
chunk: autoChargeResult.chunk,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Compare the adjusted total credits used with the credits allowed by the plan
|
||||
if (creditsWillBeUsed > chunk.price_credits) {
|
||||
sendNotification(
|
||||
team_id,
|
||||
NotificationType.LIMIT_REACHED,
|
||||
chunk.sub_current_period_start,
|
||||
chunk.sub_current_period_end
|
||||
);
|
||||
return { success: false, message: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk };
|
||||
if (creditsWillBeUsed > totalPriceCredits) {
|
||||
// Only notify if the credits they have actually used (not what they are about to use) exceed the plan's total credits
|
||||
if (chunk.adjusted_credits_used > totalPriceCredits) {
|
||||
sendNotification(
|
||||
team_id,
|
||||
NotificationType.LIMIT_REACHED,
|
||||
chunk.sub_current_period_start,
|
||||
chunk.sub_current_period_end,
|
||||
chunk
|
||||
);
|
||||
}
|
||||
return {
|
||||
success: false,
|
||||
message:
|
||||
"Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
|
||||
// Send email notification for approaching credit limit
|
||||
sendNotification(
|
||||
team_id,
|
||||
NotificationType.APPROACHING_LIMIT,
|
||||
chunk.sub_current_period_start,
|
||||
chunk.sub_current_period_end
|
||||
chunk.sub_current_period_end,
|
||||
chunk
|
||||
);
|
||||
}
|
||||
|
||||
return { success: true, message: "Sufficient credits available", remainingCredits: chunk.remaining_credits, chunk };
|
||||
return {
|
||||
success: true,
|
||||
message: "Sufficient credits available",
|
||||
remainingCredits: chunk.remaining_credits,
|
||||
chunk,
|
||||
};
|
||||
}
|
||||
|
||||
// Count the total credits used by a team within the current billing period and return the remaining credits.
|
||||
|
|
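
Put together, the intended call pattern is: gate the operation on `checkTeamCredits` (which may auto-recharge along the way) and only then record usage with `billTeam`. A hypothetical sketch of a caller; the import path and the 5-credit cost are assumptions for illustration:

```ts
import { checkTeamCredits, billTeam } from "../services/billing/credit_billing"; // assumed path
import { AuthCreditUsageChunk } from "../controllers/v1/types";

// Sketch: charge a team for one scrape request if it has (or can auto-recharge to) enough credits.
async function chargeForScrape(chunk: AuthCreditUsageChunk, teamId: string): Promise<number> {
  const credits = 5; // assumed per-request cost

  const check = await checkTeamCredits(chunk, teamId, credits);
  if (!check.success) {
    // e.g. "Insufficient credits to perform this request. ..."
    throw new Error(check.message);
  }

  // subscription_id is unknown in this context, so pass undefined (see billTeam's doc comment above).
  await billTeam(teamId, undefined, credits);
  return check.remainingCredits - credits;
}
```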
20
apps/api/src/services/billing/issue_credits.ts
Normal file
|
@ -0,0 +1,20 @@
|
|||
import { Logger } from "../../lib/logger";
|
||||
import { supabase_service } from "../supabase";
|
||||
|
||||
export async function issueCredits(team_id: string, credits: number) {
|
||||
// Add an entry to supabase coupons
|
||||
const { data, error } = await supabase_service.from("coupons").insert({
|
||||
team_id: team_id,
|
||||
credits: credits,
|
||||
status: "active",
|
||||
// indicates that this coupon was issued from auto recharge
|
||||
from_auto_recharge: true,
|
||||
});
|
||||
|
||||
if (error) {
|
||||
Logger.error(`Error adding coupon: ${error}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
56
apps/api/src/services/billing/stripe.ts
Normal file
|
@ -0,0 +1,56 @@
|
|||
import { Logger } from "../../lib/logger";
|
||||
import Stripe from "stripe";
|
||||
|
||||
const stripe = new Stripe(process.env.STRIPE_SECRET_KEY ?? "");
|
||||
|
||||
async function getCustomerDefaultPaymentMethod(customerId: string) {
|
||||
const paymentMethods = await stripe.customers.listPaymentMethods(customerId, {
|
||||
limit: 3,
|
||||
});
|
||||
return paymentMethods.data[0] ?? null;
|
||||
}
|
||||
|
||||
type ReturnStatus = "succeeded" | "requires_action" | "failed";
|
||||
export async function createPaymentIntent(
|
||||
team_id: string,
|
||||
customer_id: string
|
||||
): Promise<{ return_status: ReturnStatus; charge_id: string }> {
|
||||
try {
|
||||
const defaultPaymentMethod = await getCustomerDefaultPaymentMethod(customer_id);
|
||||
if (!defaultPaymentMethod) {
|
||||
Logger.error(`No default payment method found for customer: ${customer_id}`);
|
||||
return { return_status: "failed", charge_id: "" };
|
||||
}
|
||||
const paymentIntent = await stripe.paymentIntents.create({
|
||||
amount: 1100,
|
||||
currency: "usd",
|
||||
customer: customer_id,
|
||||
description: "Firecrawl: Auto re-charge of 1000 credits",
|
||||
payment_method_types: [defaultPaymentMethod?.type ?? "card"],
|
||||
payment_method: defaultPaymentMethod?.id,
|
||||
off_session: true,
|
||||
confirm: true,
|
||||
});
|
||||
|
||||
if (paymentIntent.status === "succeeded") {
|
||||
Logger.info(`Payment succeeded for team: ${team_id}`);
|
||||
return { return_status: "succeeded", charge_id: paymentIntent.id };
|
||||
} else if (
|
||||
paymentIntent.status === "requires_action" ||
|
||||
paymentIntent.status === "processing" ||
|
||||
paymentIntent.status === "requires_capture"
|
||||
) {
|
||||
Logger.warn(`Payment requires further action for team: ${team_id}`);
|
||||
return { return_status: "requires_action", charge_id: paymentIntent.id };
|
||||
} else {
|
||||
Logger.error(`Payment failed for team: ${team_id}`);
|
||||
return { return_status: "failed", charge_id: paymentIntent.id };
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(
|
||||
`Failed to create or confirm PaymentIntent for team: ${team_id}`
|
||||
);
|
||||
console.error(error);
|
||||
return { return_status: "failed", charge_id: "" };
|
||||
}
|
||||
}
|
|
@ -70,7 +70,9 @@ export async function logJob(job: FirecrawlJob) {
|
|||
retry: job.retry,
|
||||
},
|
||||
};
|
||||
posthog.capture(phLog);
|
||||
if(job.mode !== "single_urls") {
|
||||
posthog.capture(phLog);
|
||||
}
|
||||
}
|
||||
if (error) {
|
||||
Logger.error(`Error logging job: ${error.message}`);
|
||||
|
|
|
@ -3,6 +3,9 @@ import { withAuth } from "../../lib/withAuth";
|
|||
import { Resend } from "resend";
|
||||
import { NotificationType } from "../../types";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { sendSlackWebhook } from "../alerts/slack";
|
||||
import { getNotificationString } from "./notification_string";
|
||||
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
|
||||
|
||||
const emailTemplates: Record<
|
||||
NotificationType,
|
||||
|
@ -21,25 +24,37 @@ const emailTemplates: Record<
|
|||
subject: "Rate Limit Reached - Firecrawl",
|
||||
html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
|
||||
},
|
||||
[NotificationType.AUTO_RECHARGE_SUCCESS]: {
|
||||
subject: "Auto recharge successful - Firecrawl",
|
||||
html: "Hey there,<br/><p>Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at <a href='https://firecrawl.dev/pricing'>firecrawl.dev/pricing</a> to avoid hitting the limit.</p><br/>Thanks,<br/>Firecrawl Team<br/>",
|
||||
},
|
||||
[NotificationType.AUTO_RECHARGE_FAILED]: {
|
||||
subject: "Auto recharge failed - Firecrawl",
|
||||
html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
|
||||
},
|
||||
};
|
||||
|
||||
export async function sendNotification(
|
||||
team_id: string,
|
||||
notificationType: NotificationType,
|
||||
startDateString: string,
|
||||
endDateString: string
|
||||
endDateString: string,
|
||||
chunk: AuthCreditUsageChunk,
|
||||
bypassRecentChecks: boolean = false
|
||||
) {
|
||||
return withAuth(sendNotificationInternal)(
|
||||
team_id,
|
||||
notificationType,
|
||||
startDateString,
|
||||
endDateString
|
||||
endDateString,
|
||||
chunk,
|
||||
bypassRecentChecks
|
||||
);
|
||||
}
|
||||
|
||||
async function sendEmailNotification(
|
||||
export async function sendEmailNotification(
|
||||
email: string,
|
||||
notificationType: NotificationType
|
||||
notificationType: NotificationType,
|
||||
) {
|
||||
const resend = new Resend(process.env.RESEND_API_KEY);
|
||||
|
||||
|
@ -66,80 +81,95 @@ export async function sendNotificationInternal(
|
|||
team_id: string,
|
||||
notificationType: NotificationType,
|
||||
startDateString: string,
|
||||
endDateString: string
|
||||
endDateString: string,
|
||||
chunk: AuthCreditUsageChunk,
|
||||
bypassRecentChecks: boolean = false
|
||||
): Promise<{ success: boolean }> {
|
||||
if (team_id === "preview") {
|
||||
return { success: true };
|
||||
}
|
||||
|
||||
const fifteenDaysAgo = new Date();
|
||||
fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
|
||||
if (!bypassRecentChecks) {
|
||||
const fifteenDaysAgo = new Date();
|
||||
fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
|
||||
|
||||
const { data, error } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("notification_type", notificationType)
|
||||
.gte("sent_date", fifteenDaysAgo.toISOString());
|
||||
|
||||
if (error) {
|
||||
Logger.debug(`Error fetching notifications: ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
if (data.length !== 0) {
|
||||
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
const { data: recentData, error: recentError } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("notification_type", notificationType)
|
||||
.gte("sent_date", startDateString)
|
||||
.lte("sent_date", endDateString);
|
||||
|
||||
if (recentError) {
|
||||
Logger.debug(`Error fetching recent notifications: ${recentError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
if (recentData.length !== 0) {
|
||||
// Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`);
|
||||
return { success: false };
|
||||
} else {
|
||||
console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
|
||||
// get the emails from the user with the team_id
|
||||
const { data: emails, error: emailsError } = await supabase_service
|
||||
.from("users")
|
||||
.select("email")
|
||||
.eq("team_id", team_id);
|
||||
|
||||
if (emailsError) {
|
||||
Logger.debug(`Error fetching emails: ${emailsError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
for (const email of emails) {
|
||||
await sendEmailNotification(email.email, notificationType);
|
||||
}
|
||||
|
||||
const { error: insertError } = await supabase_service
|
||||
const { data, error } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.insert([
|
||||
{
|
||||
team_id: team_id,
|
||||
notification_type: notificationType,
|
||||
sent_date: new Date().toISOString(),
|
||||
},
|
||||
]);
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("notification_type", notificationType)
|
||||
.gte("sent_date", fifteenDaysAgo.toISOString());
|
||||
|
||||
if (insertError) {
|
||||
Logger.debug(`Error inserting notification record: ${insertError}`);
|
||||
if (error) {
|
||||
Logger.debug(`Error fetching notifications: ${error}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
return { success: true };
|
||||
if (data.length !== 0) {
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
// TODO: observation: users on free credits are not receiving notifications
|
||||
|
||||
const { data: recentData, error: recentError } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.select("*")
|
||||
.eq("team_id", team_id)
|
||||
.eq("notification_type", notificationType)
|
||||
.gte("sent_date", startDateString)
|
||||
.lte("sent_date", endDateString);
|
||||
|
||||
if (recentError) {
|
||||
Logger.debug(`Error fetching recent notifications: ${recentError.message}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
if (recentData.length !== 0) {
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
|
||||
// get the emails from the user with the team_id
|
||||
const { data: emails, error: emailsError } = await supabase_service
|
||||
.from("users")
|
||||
.select("email")
|
||||
.eq("team_id", team_id);
|
||||
|
||||
if (emailsError) {
|
||||
Logger.debug(`Error fetching emails: ${emailsError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
for (const email of emails) {
|
||||
await sendEmailNotification(email.email, notificationType);
|
||||
}
|
||||
|
||||
const { error: insertError } = await supabase_service
|
||||
.from("user_notifications")
|
||||
.insert([
|
||||
{
|
||||
team_id: team_id,
|
||||
notification_type: notificationType,
|
||||
sent_date: new Date().toISOString(),
|
||||
},
|
||||
]);
|
||||
|
||||
if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) {
|
||||
sendSlackWebhook(
|
||||
`${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`,
|
||||
false,
|
||||
process.env.SLACK_ADMIN_WEBHOOK_URL
|
||||
).catch((error) => {
|
||||
Logger.debug(`Error sending slack notification: ${error}`);
|
||||
});
|
||||
}
|
||||
|
||||
if (insertError) {
|
||||
Logger.debug(`Error inserting notification record: ${insertError}`);
|
||||
return { success: false };
|
||||
}
|
||||
|
||||
return { success: true };
|
||||
}
|
||||
|
|
21
apps/api/src/services/notification/notification_string.ts
Normal file
|
@ -0,0 +1,21 @@
|
|||
import { NotificationType } from "../../types";
|
||||
|
||||
// depending on the notification type, return the appropriate string
|
||||
export function getNotificationString(
|
||||
notificationType: NotificationType
|
||||
): string {
|
||||
switch (notificationType) {
|
||||
case NotificationType.APPROACHING_LIMIT:
|
||||
return "Approaching the limit (80%)";
|
||||
case NotificationType.LIMIT_REACHED:
|
||||
return "Limit reached (100%)";
|
||||
case NotificationType.RATE_LIMIT_REACHED:
|
||||
return "Rate limit reached";
|
||||
case NotificationType.AUTO_RECHARGE_SUCCESS:
|
||||
return "Auto-recharge successful";
|
||||
case NotificationType.AUTO_RECHARGE_FAILED:
|
||||
return "Auto-recharge failed";
|
||||
default:
|
||||
return "Unknown notification type";
|
||||
}
|
||||
}
|
|
@ -329,7 +329,8 @@ async function processJob(job: Job, token: string) {
|
|||
job.id as string,
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1
|
||||
job.data.v1,
|
||||
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
|
||||
);
|
||||
}
|
||||
if (job.data.webhook && job.data.mode !== "crawl" && job.data.v1) {
|
||||
|
@ -339,7 +340,7 @@ async function processJob(job: Job, token: string) {
|
|||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.page",
|
||||
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
|
||||
true
|
||||
);
|
||||
}
|
||||
|
@ -365,7 +366,7 @@ async function processJob(job: Job, token: string) {
|
|||
|
||||
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
|
||||
|
||||
if (!job.data.sitemapped) {
|
||||
if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
|
||||
if (!sc.cancelled) {
|
||||
const crawler = crawlToCrawler(job.data.crawl_id, sc);
|
||||
|
||||
|
@ -415,8 +416,6 @@ async function processJob(job: Job, token: string) {
|
|||
}
|
||||
|
||||
if (await finishCrawl(job.data.crawl_id)) {
|
||||
|
||||
|
||||
if (!job.data.v1) {
|
||||
const jobIDs = await getCrawlJobs(job.data.crawl_id);
|
||||
|
||||
|
@ -439,7 +438,7 @@ async function processJob(job: Job, token: string) {
|
|||
docs: [],
|
||||
time_taken: (Date.now() - sc.createdAt) / 1000,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
|
||||
url: sc.originUrl,
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
pageOptions: sc.pageOptions,
|
||||
|
@ -469,7 +468,7 @@ async function processJob(job: Job, token: string) {
|
|||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.completed"
|
||||
job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
|
@ -487,7 +486,7 @@ async function processJob(job: Job, token: string) {
|
|||
[],
|
||||
job.data.webhook,
|
||||
job.data.v1,
|
||||
"crawl.completed"
|
||||
job.data.crawlerOptions !== null ? "crawl.completed" : "batch_scrape.completed"
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -499,8 +498,8 @@ async function processJob(job: Job, token: string) {
|
|||
docs: [],
|
||||
time_taken: (Date.now() - sc.createdAt) / 1000,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
url: sc.originUrl,
|
||||
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
|
||||
url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
|
||||
crawlerOptions: sc.crawlerOptions,
|
||||
pageOptions: sc.pageOptions,
|
||||
origin: job.data.origin,
|
||||
|
@ -556,7 +555,8 @@ async function processJob(job: Job, token: string) {
|
|||
job.data.crawl_id ?? (job.id as string),
|
||||
data,
|
||||
job.data.webhook,
|
||||
job.data.v1
|
||||
job.data.v1,
|
||||
job.data.crawlerOptions !== null ? "crawl.page" : "batch_scrape.page",
|
||||
);
|
||||
}
|
||||
// if (job.data.v1) {
|
||||
|
@ -605,7 +605,7 @@ async function processJob(job: Job, token: string) {
|
|||
docs: [],
|
||||
time_taken: 0,
|
||||
team_id: job.data.team_id,
|
||||
mode: "crawl",
|
||||
mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
|
||||
url: sc ? sc.originUrl : job.data.url,
|
||||
crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
|
||||
pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
|
||||
|
|
|
@ -130,6 +130,8 @@ export enum NotificationType {
|
|||
APPROACHING_LIMIT = "approachingLimit",
|
||||
LIMIT_REACHED = "limitReached",
|
||||
RATE_LIMIT_REACHED = "rateLimitReached",
|
||||
AUTO_RECHARGE_SUCCESS = "autoRechargeSuccess",
|
||||
AUTO_RECHARGE_FAILED = "autoRechargeFailed",
|
||||
}
|
||||
|
||||
export type ScrapeLog = {
|
||||
|
@ -159,4 +161,4 @@ export type PlanType =
|
|||
| "";
|
||||
|
||||
|
||||
export type WebhookEventType = "crawl.page" | "crawl.started" | "crawl.completed" | "crawl.failed";
|
||||
export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
|
|
@ -6,7 +6,7 @@
|
|||
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
|
||||
"contact": {
|
||||
"name": "Firecrawl Support",
|
||||
"url": "https://firecrawl.dev",
|
||||
"url": "https://firecrawl.dev/support",
|
||||
"email": "support@firecrawl.dev"
|
||||
}
|
||||
},
|
||||
|
@ -97,6 +97,127 @@
|
|||
"description": "The prompt to use for the extraction without a schema (Optional)"
|
||||
}
|
||||
}
|
||||
},
|
||||
"actions": {
|
||||
"type": "array",
|
||||
"description": "Actions to perform on the page before grabbing the content",
|
||||
"items": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "object",
|
||||
"title": "Wait",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["wait"],
|
||||
"description": "Wait for a specified amount of milliseconds"
|
||||
},
|
||||
"milliseconds": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"description": "Number of milliseconds to wait"
|
||||
}
|
||||
},
|
||||
"required": ["type", "milliseconds"]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "Screenshot",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["screenshot"],
|
||||
"description": "Take a screenshot"
|
||||
},
|
||||
"fullPage": {
|
||||
"type": "boolean",
|
||||
"description": "Should the screenshot be full-page or viewport sized?",
|
||||
"default": false
|
||||
}
|
||||
},
|
||||
"required": ["type"]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "Click",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["click"],
|
||||
"description": "Click on an element"
|
||||
},
|
||||
"selector": {
|
||||
"type": "string",
|
||||
"description": "Query selector to find the element by",
|
||||
"example": "#load-more-button"
|
||||
}
|
||||
},
|
||||
"required": ["type", "selector"]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "Write text",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["write"],
|
||||
"description": "Write text into an input field"
|
||||
},
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "Text to type",
|
||||
"example": "Hello, world!"
|
||||
},
|
||||
"selector": {
|
||||
"type": "string",
|
||||
"description": "Query selector for the input field",
|
||||
"example": "#search-input"
|
||||
}
|
||||
},
|
||||
"required": ["type", "text", "selector"]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "Press a key",
|
||||
"description": "Press a key on the page. See https://asawicki.info/nosense/doc/devices/keyboard/key_codes.html for key codes.",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["press"],
|
||||
"description": "Press a key on the page"
|
||||
},
|
||||
"key": {
|
||||
"type": "string",
|
||||
"description": "Key to press",
|
||||
"example": "Enter"
|
||||
}
|
||||
},
|
||||
"required": ["type", "key"]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "Scroll",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["scroll"],
|
||||
"description": "Scroll the page"
|
||||
},
|
||||
"direction": {
|
||||
"type": "string",
|
||||
"enum": ["up", "down"],
|
||||
"description": "Direction to scroll"
|
||||
},
|
||||
"amount": {
|
||||
"type": "integer",
|
||||
"description": "Amount to scroll in pixels",
|
||||
"minimum": 1
|
||||
}
|
||||
},
|
||||
"required": ["type", "direction"]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
|
@ -341,14 +462,14 @@
|
|||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to exclude"
|
||||
"description": "Specifies URL patterns to exclude from the crawl by comparing website paths against the provided regex patterns. For example, if you set \"excludePaths\": [\"blog/*\"] for the base URL firecrawl.dev, any results matching that pattern will be excluded, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap."
|
||||
},
|
||||
"includePaths": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to include"
|
||||
"description": "Specifies URL patterns to include in the crawl by comparing website paths against the provided regex patterns. Only the paths that match the specified patterns will be included in the response. For example, if you set \"includePaths\": [\"blog/*\"] for the base URL firecrawl.dev, only results matching that pattern will be included, such as https://www.firecrawl.dev/blog/firecrawl-launch-week-1-recap."
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
|
@ -362,7 +483,7 @@
|
|||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
"description": "Maximum number of pages to crawl. Default limit is 10000.",
|
||||
"default": 10
|
||||
},
|
||||
"allowBackwardLinks": {
|
||||
|
@ -513,7 +634,7 @@
|
|||
},
|
||||
"search": {
|
||||
"type": "string",
|
||||
"description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 100 search results. However, if map finds more results, there is no limit applied."
|
||||
"description": "Search query to use for mapping. During the Alpha phase, the 'smart' part of the search functionality is limited to 1000 search results. However, if map finds more results, there is no limit applied."
|
||||
},
|
||||
"ignoreSitemap": {
|
||||
"type": "boolean",
|
||||
|
@ -642,6 +763,21 @@
|
|||
},
|
||||
"description": "List of links on the page if `links` is in `formats`"
|
||||
},
|
||||
"actions": {
|
||||
"type": "object",
|
||||
"nullable": true,
|
||||
"description": "Results of the actions specified in the `actions` parameter. Only present if the `actions` parameter was provided in the request",
|
||||
"properties": {
|
||||
"screenshots": {
|
||||
"type": "array",
|
||||
"description": "Screenshot URLs, in the same order as the screenshot actions provided.",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "url"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
|
@ -145,6 +145,46 @@ watch.addEventListener("done", state => {
|
|||
});
|
||||
```
|
||||
|
||||
### Batch scraping multiple URLs
|
||||
|
||||
To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
|
||||
|
||||
```js
|
||||
const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
|
||||
formats: ['markdown', 'html'],
|
||||
})
|
||||
```
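
Like the other SDK methods, `batchScrapeUrls` throws a `FirecrawlError` if the API reports a failure, so calls are typically wrapped in a `try/catch` (sketch):

```js
try {
  const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
    formats: ['markdown', 'html'],
  });
  console.log(batchScrapeResponse);
} catch (error) {
  console.error(error.message);
}
```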
|
||||
|
||||
|
||||
#### Asynchronous batch scrape
|
||||
|
||||
To initiate an asynchronous batch scrape, use the `asyncBatchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats. On success, the method returns an ID, which you can use to check the status of the batch scrape.
|
||||
|
||||
```js
|
||||
const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
|
||||
```
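
The returned `id` can then be polled with `checkBatchScrapeStatus`, mirroring the crawl status flow (sketch):

```js
const batchScrapeStatus = await app.checkBatchScrapeStatus(asyncBatchScrapeResult.id);
console.log(`${batchScrapeStatus.status}: ${batchScrapeStatus.completed}/${batchScrapeStatus.total} pages scraped`);
```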
|
||||
|
||||
#### Batch scrape with WebSockets
|
||||
|
||||
To use batch scrape with WebSockets, use the `batchScrapeUrlsAndWatch` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.
|
||||
|
||||
```js
|
||||
// Batch scrape multiple URLs with WebSockets:
|
||||
const watch = await app.batchScrapeUrlsAndWatch(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
|
||||
|
||||
watch.addEventListener("document", doc => {
|
||||
console.log("DOC", doc.detail);
|
||||
});
|
||||
|
||||
watch.addEventListener("error", err => {
|
||||
console.error("ERR", err.detail.error);
|
||||
});
|
||||
|
||||
watch.addEventListener("done", state => {
|
||||
console.log("DONE", state.detail.status);
|
||||
});
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "firecrawl",
|
||||
"version": "1.6.1",
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "1.7.2",
|
||||
"description": "JavaScript SDK for Firecrawl API",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
|
|
|
@ -82,6 +82,10 @@ export interface CrawlScrapeOptions {
|
|||
onlyMainContent?: boolean;
|
||||
waitFor?: number;
|
||||
timeout?: number;
|
||||
location?: {
|
||||
country?: string;
|
||||
languages?: string[];
|
||||
};
|
||||
}
|
||||
|
||||
export type Action = {
|
||||
|
@ -154,6 +158,17 @@ export interface CrawlResponse {
|
|||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for batch scrape operations.
|
||||
* Defines the structure of the response received after initiating a batch scrape.
|
||||
*/
|
||||
export interface BatchScrapeResponse {
|
||||
id?: string;
|
||||
url?: string;
|
||||
success: true;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for job status checks.
|
||||
* Provides detailed status of a crawl job including progress and results.
|
||||
|
@ -169,6 +184,21 @@ export interface CrawlStatusResponse {
|
|||
data: FirecrawlDocument<undefined>[];
|
||||
};
|
||||
|
||||
/**
|
||||
* Response interface for batch scrape job status checks.
|
||||
* Provides detailed status of a batch scrape job including progress and results.
|
||||
*/
|
||||
export interface BatchScrapeStatusResponse {
|
||||
success: true;
|
||||
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||
completed: number;
|
||||
total: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: Date;
|
||||
next?: string;
|
||||
data: FirecrawlDocument<undefined>[];
|
||||
};
|
||||
|
||||
/**
|
||||
* Parameters for mapping operations.
|
||||
* Defines options for mapping URLs during a crawl.
|
||||
|
@ -493,6 +523,144 @@ export default class FirecrawlApp {
|
|||
return { success: false, error: "Internal server error." };
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiates a batch scrape job for multiple URLs using the Firecrawl API.
|
||||
* @param urls - The URLs to scrape.
|
||||
* @param params - Additional parameters for the scrape request.
|
||||
* @param pollInterval - Time in seconds for job status checks.
|
||||
* @param idempotencyKey - Optional idempotency key for the request.
|
||||
* @returns The response from the batch scrape operation.
|
||||
*/
|
||||
async batchScrapeUrls(
|
||||
urls: string[],
|
||||
params?: ScrapeParams,
|
||||
pollInterval: number = 2,
|
||||
idempotencyKey?: string
|
||||
): Promise<BatchScrapeStatusResponse | ErrorResponse> {
|
||||
const headers = this.prepareHeaders(idempotencyKey);
|
||||
let jsonData: any = { urls, ...(params ?? {}) };
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
this.apiUrl + `/v1/batch/scrape`,
|
||||
jsonData,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
const id: string = response.data.id;
|
||||
return this.monitorJobStatus(id, headers, pollInterval);
|
||||
} else {
|
||||
this.handleError(response, "start batch scrape job");
|
||||
}
|
||||
} catch (error: any) {
|
||||
if (error.response?.data?.error) {
|
||||
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
|
||||
} else {
|
||||
throw new FirecrawlError(error.message, 500);
|
||||
}
|
||||
}
|
||||
return { success: false, error: "Internal server error." };
|
||||
}
|
||||
|
||||
async asyncBatchScrapeUrls(
|
||||
urls: string[],
|
||||
params?: ScrapeParams,
|
||||
idempotencyKey?: string
|
||||
): Promise<BatchScrapeResponse | ErrorResponse> {
|
||||
const headers = this.prepareHeaders(idempotencyKey);
|
||||
let jsonData: any = { urls, ...(params ?? {}) };
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
this.apiUrl + `/v1/batch/scrape`,
|
||||
jsonData,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
return response.data;
|
||||
} else {
|
||||
this.handleError(response, "start batch scrape job");
|
||||
}
|
||||
} catch (error: any) {
|
||||
if (error.response?.data?.error) {
|
||||
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
|
||||
} else {
|
||||
throw new FirecrawlError(error.message, 500);
|
||||
}
|
||||
}
|
||||
return { success: false, error: "Internal server error." };
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
|
||||
* @param urls - The URLs to scrape.
|
||||
* @param params - Additional parameters for the scrape request.
|
||||
* @param idempotencyKey - Optional idempotency key for the request.
|
||||
* @returns A CrawlWatcher instance to monitor the batch scrape job.
|
||||
*/
|
||||
async batchScrapeUrlsAndWatch(
|
||||
urls: string[],
|
||||
params?: ScrapeParams,
|
||||
idempotencyKey?: string,
|
||||
) {
|
||||
const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
|
||||
|
||||
if (crawl.success && crawl.id) {
|
||||
const id = crawl.id;
|
||||
return new CrawlWatcher(id, this);
|
||||
}
|
||||
|
||||
throw new FirecrawlError("Batch scrape job failed to start", 400);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks the status of a batch scrape job using the Firecrawl API.
|
||||
* @param id - The ID of the batch scrape operation.
|
||||
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
|
||||
* @returns The response containing the job status.
|
||||
*/
|
||||
async checkBatchScrapeStatus(id?: string, getAllData = false): Promise<BatchScrapeStatusResponse | ErrorResponse> {
|
||||
if (!id) {
|
||||
throw new FirecrawlError("No batch scrape ID provided", 400);
|
||||
}
|
||||
|
||||
const headers: AxiosRequestHeaders = this.prepareHeaders();
|
||||
try {
|
||||
const response: AxiosResponse = await this.getRequest(
|
||||
`${this.apiUrl}/v1/batch/scrape/${id}`,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
let allData = response.data.data;
|
||||
if (getAllData && response.data.status === "completed") {
|
||||
let statusData = response.data
|
||||
if ("data" in statusData) {
|
||||
let data = statusData.data;
|
||||
while ('next' in statusData) {
|
||||
statusData = (await this.getRequest(statusData.next, headers)).data;
|
||||
data = data.concat(statusData.data);
|
||||
}
|
||||
allData = data;
|
||||
}
|
||||
}
|
||||
return ({
|
||||
success: response.data.success,
|
||||
status: response.data.status,
|
||||
total: response.data.total,
|
||||
completed: response.data.completed,
|
||||
creditsUsed: response.data.creditsUsed,
|
||||
expiresAt: new Date(response.data.expiresAt),
|
||||
next: response.data.next,
|
||||
data: allData,
|
||||
error: response.data.error,
|
||||
})
|
||||
} else {
|
||||
this.handleError(response, "check batch scrape status");
|
||||
}
|
||||
} catch (error: any) {
|
||||
throw new FirecrawlError(error.message, 500);
|
||||
}
|
||||
return { success: false, error: "Internal server error." };
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepares the headers for an API request.
|
||||
* @param idempotencyKey - Optional key to ensure idempotency.
|
||||
|
|
22
apps/js-sdk/package-lock.json
generated
|
@ -9,7 +9,7 @@
|
|||
"version": "1.0.0",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^1.0.3",
|
||||
"@mendable/firecrawl-js": "^1.7.0-beta.2",
|
||||
"axios": "^1.6.8",
|
||||
"firecrawl": "^1.2.0",
|
||||
"ts-node": "^10.9.2",
|
||||
|
@ -423,31 +423,17 @@
|
|||
}
|
||||
},
|
||||
"node_modules/@mendable/firecrawl-js": {
|
||||
"version": "1.2.2",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
|
||||
"integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
|
||||
"version": "1.7.0-beta.2",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.7.0-beta.2.tgz",
|
||||
"integrity": "sha512-6L5r6BOuMPjLgSDq85xs2IpVgX9Tb/EdesKZvmtFucoaFZzIsgCQb0ZfSvwaRmqTkj53o+7eSgCcm+gsnR/yeQ==",
|
||||
"dependencies": {
|
||||
"axios": "^1.6.8",
|
||||
"dotenv": "^16.4.5",
|
||||
"isows": "^1.0.4",
|
||||
"typescript-event-target": "^1.1.1",
|
||||
"uuid": "^9.0.1",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@mendable/firecrawl-js/node_modules/uuid": {
|
||||
"version": "9.0.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
|
||||
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
"https://github.com/sponsors/ctavan"
|
||||
],
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/@tsconfig/node10": {
|
||||
"version": "1.0.11",
|
||||
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^1.0.3",
|
||||
"@mendable/firecrawl-js": "1.7.1",
|
||||
"axios": "^1.6.8",
|
||||
"firecrawl": "^1.2.0",
|
||||
"ts-node": "^10.9.2",
|
||||
|
|
|
@ -36,7 +36,6 @@ crawl_status = app.crawl_url(
|
|||
'limit': 100,
|
||||
'scrapeOptions': {'formats': ['markdown', 'html']}
|
||||
},
|
||||
wait_until_done=True,
|
||||
poll_interval=30
|
||||
)
|
||||
print(crawl_status)
|
||||
|
@ -150,6 +149,69 @@ async def start_crawl_and_watch():
|
|||
await start_crawl_and_watch()
|
||||
```
|
||||
|
||||
### Scraping multiple URLs in batch
|
||||
|
||||
To batch scrape multiple URLs, use the `batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
|
||||
|
||||
```python
|
||||
idempotency_key = str(uuid.uuid4()) # optional idempotency key
|
||||
batch_scrape_result = app.batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']}, 2, idempotency_key)
|
||||
print(batch_scrape_result)
|
||||
```
|
||||
|
||||
### Asynchronous batch scrape
|
||||
|
||||
To run a batch scrape asynchronously, use the `async_batch_scrape_urls` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
|
||||
|
||||
```python
|
||||
batch_scrape_result = app.async_batch_scrape_urls(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
|
||||
print(batch_scrape_result)
|
||||
```
|
||||
|
||||
### Checking batch scrape status
|
||||
|
||||
To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
|
||||
|
||||
```python
|
||||
id = batch_scrape_result['id']
|
||||
status = app.check_batch_scrape_status(id)
|
||||
```
|
||||
|
||||
### Batch scrape with WebSockets
|
||||
|
||||
To use batch scrape with WebSockets, use the `batch_scrape_urls_and_watch` method. It takes the URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the scraper, such as the output formats.
|
||||
|
||||
```python
|
||||
# inside an async function...
|
||||
nest_asyncio.apply()
|
||||
|
||||
# Define event handlers
|
||||
def on_document(detail):
|
||||
print("DOC", detail)
|
||||
|
||||
def on_error(detail):
|
||||
print("ERR", detail['error'])
|
||||
|
||||
def on_done(detail):
|
||||
print("DONE", detail['status'])
|
||||
|
||||
# Function to start the crawl and watch process
|
||||
async def start_crawl_and_watch():
|
||||
# Initiate the batch scrape job and get the watcher
|
||||
watcher = app.batch_scrape_urls_and_watch(['firecrawl.dev', 'mendable.ai'], {'formats': ['markdown', 'html']})
|
||||
|
||||
# Add event listeners
|
||||
watcher.add_event_listener("document", on_document)
|
||||
watcher.add_event_listener("error", on_error)
|
||||
watcher.add_event_listener("done", on_done)
|
||||
|
||||
# Start the watcher
|
||||
await watcher.connect()
|
||||
|
||||
# Run the event loop
|
||||
await start_crawl_and_watch()
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
|
||||
|
|
|
@ -9,6 +9,23 @@ app = FirecrawlApp(api_key="fc-")
|
|||
scrape_result = app.scrape_url('firecrawl.dev')
|
||||
print(scrape_result['markdown'])
|
||||
|
||||
|
||||
# Test batch scrape
|
||||
urls = ['https://example.com', 'https://docs.firecrawl.dev']
|
||||
batch_scrape_params = {
|
||||
'formats': ['markdown', 'html'],
|
||||
}
|
||||
|
||||
# Synchronous batch scrape
|
||||
batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
|
||||
print("Synchronous Batch Scrape Result:")
|
||||
print(batch_result['data'][0]['markdown'])
|
||||
|
||||
# Asynchronous batch scrape
|
||||
async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
|
||||
print("\nAsynchronous Batch Scrape Result:")
|
||||
print(async_batch_result)
|
||||
|
||||
# Crawl a website:
|
||||
idempotency_key = str(uuid.uuid4()) # optional idempotency key
|
||||
crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
|
||||
|
|
|
@ -13,7 +13,7 @@ import os
|
|||
|
||||
from .firecrawl import FirecrawlApp
|
||||
|
||||
__version__ = "1.3.0"
|
||||
__version__ = "1.4.0"
|
||||
|
||||
# Define the logger for the Firecrawl project
|
||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||
|
|
|
@ -81,8 +81,10 @@ class FirecrawlApp:
|
|||
response = response.json()
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
elif "error" in response:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response}')
|
||||
else:
|
||||
self._handle_error(response, 'scrape URL')
|
||||
|
||||
|
@ -117,7 +119,14 @@ class FirecrawlApp:
|
|||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Any: The crawl job ID or the crawl results if waiting until completion.
|
||||
Dict[str, Any]: A dictionary containing the crawl results. The structure includes:
|
||||
- 'success' (bool): Indicates if the crawl was successful.
|
||||
- 'status' (str): The final status of the crawl job (e.g., 'completed').
|
||||
- 'completed' (int): Number of scraped pages that completed.
|
||||
- 'total' (int): Total number of scraped pages.
|
||||
- 'creditsUsed' (int): Estimated number of API credits used for this crawl.
|
||||
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires.
|
||||
- 'data' (List[Dict]): List of all the scraped pages.
|
||||
|
||||
Raises:
|
||||
Exception: If the crawl job initiation or monitoring fails.
|
||||
|
@ -146,7 +155,10 @@ class FirecrawlApp:
|
|||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: The response from the crawl initiation request.
|
||||
Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes:
|
||||
- 'success' (bool): Indicates if the crawl initiation was successful.
|
||||
- 'id' (str): The unique identifier for the crawl job.
|
||||
- 'url' (str): The URL to check the status of the crawl job.
|
||||
"""
|
||||
endpoint = f'/v1/crawl'
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
|
@ -236,7 +248,7 @@ class FirecrawlApp:
|
|||
params (Optional[Dict[str, Any]]): Additional parameters for the map search.
|
||||
|
||||
Returns:
|
||||
Any: The result of the map search, typically a dictionary containing mapping data.
|
||||
List[str]: A list of URLs discovered during the map search.
|
||||
"""
|
||||
endpoint = f'/v1/map'
|
||||
headers = self._prepare_headers()
|
||||
|
@ -256,11 +268,130 @@ class FirecrawlApp:
|
|||
response = response.json()
|
||||
if response['success'] and 'links' in response:
|
||||
return response
|
||||
else:
|
||||
elif 'error' in response:
|
||||
raise Exception(f'Failed to map URL. Error: {response["error"]}')
|
||||
else:
|
||||
raise Exception(f'Failed to map URL. Error: {response}')
|
||||
else:
|
||||
self._handle_error(response, 'map')
|
||||
|
||||
def batch_scrape_urls(self, urls: list[str],
|
||||
params: Optional[Dict[str, Any]] = None,
|
||||
poll_interval: Optional[int] = 2,
|
||||
idempotency_key: Optional[str] = None) -> Any:
|
||||
"""
|
||||
Initiate a batch scrape job for the specified URLs using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
urls (list[str]): The URLs to scrape.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
|
||||
poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
|
||||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
|
||||
- 'success' (bool): Indicates if the batch scrape was successful.
|
||||
- 'status' (str): The final status of the batch scrape job (e.g., 'completed').
|
||||
- 'completed' (int): Number of scraped pages that completed.
|
||||
- 'total' (int): Total number of scraped pages.
|
||||
- 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
|
||||
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
|
||||
- 'data' (List[Dict]): List of all the scraped pages.
|
||||
|
||||
Raises:
|
||||
Exception: If the batch scrape job initiation or monitoring fails.
|
||||
"""
|
||||
endpoint = f'/v1/batch/scrape'
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'urls': urls}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
id = response.json().get('id')
|
||||
return self._monitor_job_status(id, headers, poll_interval)
|
||||
|
||||
else:
|
||||
self._handle_error(response, 'start batch scrape job')
|
||||
|
||||
|
||||
def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Initiate a batch scrape job asynchronously.
|
||||
|
||||
Args:
|
||||
urls (list[str]): The URLs to scrape.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
|
||||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
|
||||
- 'success' (bool): Indicates if the batch scrape initiation was successful.
|
||||
- 'id' (str): The unique identifier for the batch scrape job.
|
||||
- 'url' (str): The URL to check the status of the batch scrape job.
|
||||
"""
|
||||
endpoint = f'/v1/batch/scrape'
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'urls': urls}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
self._handle_error(response, 'start batch scrape job')
|
||||
|
||||
def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
|
||||
"""
|
||||
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
|
||||
|
||||
Args:
|
||||
urls (list[str]): The URLs to scrape.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
|
||||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
|
||||
"""
|
||||
crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
|
||||
if crawl_response['success'] and 'id' in crawl_response:
|
||||
return CrawlWatcher(crawl_response['id'], self)
|
||||
else:
|
||||
raise Exception("Batch scrape job failed to start")
|
||||
|
||||
def check_batch_scrape_status(self, id: str) -> Any:
|
||||
"""
|
||||
Check the status of a batch scrape job using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
id (str): The ID of the batch scrape job.
|
||||
|
||||
Returns:
|
||||
Any: The status of the batch scrape job.
|
||||
|
||||
Raises:
|
||||
Exception: If the status check request fails.
|
||||
"""
|
||||
endpoint = f'/v1/batch/scrape/{id}'
|
||||
|
||||
headers = self._prepare_headers()
|
||||
response = self._get_request(f'{self.api_url}{endpoint}', headers)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
return {
|
||||
'success': True,
|
||||
'status': data.get('status'),
|
||||
'total': data.get('total'),
|
||||
'completed': data.get('completed'),
|
||||
'creditsUsed': data.get('creditsUsed'),
|
||||
'expiresAt': data.get('expiresAt'),
|
||||
'next': data.get('next'),
|
||||
'data': data.get('data'),
|
||||
'error': data.get('error')
|
||||
}
|
||||
else:
|
||||
self._handle_error(response, 'check batch scrape status')
|
||||
|
||||
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
|
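A minimal usage sketch of the batch scrape methods added above, assuming `FIRECRAWL_API_KEY` is set in the environment and using placeholder URLs; the response fields follow the docstrings in the diff but may vary by API version:

```python
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

urls = ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"]

# Synchronous: submits the job and polls until it finishes (poll_interval seconds between checks).
result = app.batch_scrape_urls(urls, {"formats": ["markdown", "html"]})
print(result["status"], result["completed"], "/", result["total"])

# Asynchronous: returns {'success', 'id', 'url'} immediately; poll the status yourself.
job = app.async_batch_scrape_urls(urls, {"formats": ["markdown"]})
status = app.check_batch_scrape_status(job["id"])
print(status["status"], status["creditsUsed"])
```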
166 examples/claude_web_crawler/claude_web_crawler.py Normal file
@ -0,0 +1,166 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
import agentops

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")

# Initialize the FirecrawlApp and Anthropic client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = anthropic.Anthropic(api_key=anthropic_api_key)

# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app, client):
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
        """

        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        completion = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1000,
            temperature=0,
            system="You are an expert web crawler. Respond with the best search parameter.",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": map_prompt
                        }
                    ]
                }
            ]
        )

        map_search_parameter = completion.content[0].text
        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
        return map_website['links']
    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None

# Scrape the top pages and see if the objective is met; if so return it as JSON, else return None
def find_objective_in_top_pages(map_website, objective, app, client):
    try:
        # Get top 2 links from the map result
        top_links = map_website[:2]
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        # Scrape the pages in batch
        batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})
        print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}")

        for scrape_result in batch_scrape_result['data']:

            # Check if objective is met
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
            If the objective is not met with confidence, respond with 'Objective not met'.

            Objective: {objective}
            Scraped content: {scrape_result['markdown']}

            Remember:
            1. Only return JSON if you are confident the objective is fully met.
            2. Keep the JSON structure as simple and flat as possible.
            3. Do not include any explanations or markdown formatting in your response.
            """

            completion = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1000,
                temperature=0,
                system="You are an expert web crawler. Respond with the relevant information in JSON format.",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": check_prompt
                            }
                        ]
                    }
                ]
            )

            result = completion.content[0].text

            if result != "Objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    return json.loads(result)
                except json.JSONDecodeError:
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None

    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None

# Main function to execute the process
def main():
    # Get user input
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    if not url.strip():
        url = "https://www.firecrawl.dev/"

    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
    if not objective.strip():
        objective = "find me the pricing plans"

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    # Find the relevant page
    map_website = find_relevant_page_via_map(objective, url, app, client)
    print(map_website)

    if map_website:
        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
        # Find objective in top pages
        result = find_objective_in_top_pages(map_website, objective, app, client)

        if result:
            print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
        else:
            print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
    else:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")

if __name__ == "__main__":
    agentops.init(os.getenv("AGENTOPS_API_KEY"))
    main()
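One fragile spot in the script above is the bare `json.loads(result)`: Claude can wrap JSON in markdown fences despite the prompt instructions. A small hedged helper (hypothetical, not part of the committed file) that mirrors the fence-stripping the Grok example below already does before parsing:

```python
import json

def parse_llm_json(text: str):
    """Best-effort JSON parse for LLM output: strips markdown code fences before loading."""
    cleaned = text.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return None
```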
150 examples/grok_web_crawler/grok_web_crawler.py Normal file
@ -0,0 +1,150 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import requests

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
grok_api_key = os.getenv("GROK_API_KEY")

# Initialize the FirecrawlApp
app = FirecrawlApp(api_key=firecrawl_api_key)

# Function to make Grok API calls
def grok_completion(prompt):
    url = "https://api.x.ai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {grok_api_key}"
    }
    data = {
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "model": "grok-beta",
        "stream": False,
        "temperature": 0
    }
    response = requests.post(url, headers=headers, json=data)
    return response.json()['choices'][0]['message']['content']

# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app):
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
        """

        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        map_search_parameter = grok_completion(map_prompt)
        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        print(f"{Colors.MAGENTA}{map_search_parameter}{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
        print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
        return map_website["links"]
    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None

# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None
def find_objective_in_top_pages(map_website, objective, app):
    try:
        print(f"{Colors.MAGENTA}{map_website}{Colors.RESET}")
        # Get top 3 links from the map result
        top_links = map_website[:3] if isinstance(map_website, list) else []
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        for link in top_links:
            print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
            # Scrape the page
            scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")

            # Check if objective is met
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
            If the objective is not met with confidence, respond with 'Objective not met'.

            Objective: {objective}
            Scraped content: {scrape_result['markdown']}

            Remember:
            1. Only return JSON if you are confident the objective is fully met.
            2. Keep the JSON structure as simple and flat as possible.
            3. Do not include any explanations or markdown formatting in your response.
            """

            result = grok_completion(check_prompt)
            print(f"{Colors.MAGENTA}{result}{Colors.RESET}")
            if result != "Objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    result = result.replace("```json", "").replace("```", "")
                    return json.loads(result)
                except json.JSONDecodeError:
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None

    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None

# Main function to execute the process
def main():
    # Get user input
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    # Find the relevant page
    map_website = find_relevant_page_via_map(objective, url, app)

    if map_website:
        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
        # Find objective in top pages
        result = find_objective_in_top_pages(map_website, objective, app)

        if result:
            print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
        else:
            print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
    else:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")

if __name__ == "__main__":
    main()
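`grok_completion` above assumes the x.ai request always succeeds and would fail with a `KeyError` on an error response. A hedged variant (illustrative only, not part of the committed file; the timeout value is an assumption) that surfaces HTTP errors explicitly:

```python
import requests

def grok_completion_safe(prompt: str, grok_api_key: str) -> str:
    """Like grok_completion, but raises a clear error on non-2xx responses."""
    response = requests.post(
        "https://api.x.ai/v1/chat/completions",
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {grok_api_key}",
        },
        json={
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            "model": "grok-beta",
            "stream": False,
            "temperature": 0,
        },
        timeout=60,  # assumed timeout; the original makes the call without one
    )
    response.raise_for_status()  # fail loudly instead of KeyError on 'choices'
    return response.json()["choices"][0]["message"]["content"]
```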
2 examples/openai_swarm_firecrawl/.env.example Normal file
@ -0,0 +1,2 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
37 examples/openai_swarm_firecrawl/README.md Normal file
@ -0,0 +1,37 @@
# Swarm Firecrawl Marketing Agent

A multi-agent system that uses [OpenAI Swarm](https://github.com/openai/swarm) to build AI-powered marketing strategies, with [Firecrawl](https://firecrawl.dev) for web scraping.

## Agents

1. User Interface: Manages user interactions
2. Website Scraper: Extracts clean LLM-ready content via Firecrawl API
3. Analyst: Provides marketing insights
4. Campaign Idea: Generates marketing campaign concepts
5. Copywriter: Creates compelling marketing copy

## Requirements

- [Firecrawl](https://firecrawl.dev) API key
- [OpenAI](https://platform.openai.com/api-keys) API key

## Setup

1. Install the required packages:

```
pip install -r requirements.txt
```

2. Set up your environment variables in a `.env` file:

```
OPENAI_API_KEY=your_openai_api_key
FIRECRAWL_API_KEY=your_firecrawl_api_key
```

## Usage

Run the main script to start the interactive demo:

```
python main.py
```
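A small optional Python sketch (not part of the committed example; the check itself is illustrative) that fails fast when the keys listed in the `.env` step above are missing, before `main.py` starts its agents:

```python
import os
from dotenv import load_dotenv

load_dotenv()

# Keys required by the README above.
REQUIRED_KEYS = ["OPENAI_API_KEY", "FIRECRAWL_API_KEY"]
missing = [key for key in REQUIRED_KEYS if not os.getenv(key)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
```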
108 examples/openai_swarm_firecrawl/main.py Normal file
@ -0,0 +1,108 @@
import os
from firecrawl import FirecrawlApp
from swarm import Agent
from swarm.repl import run_demo_loop
import dotenv
from openai import OpenAI

dotenv.load_dotenv()

# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def scrape_website(url):
    """Scrape a website using Firecrawl."""
    scrape_status = app.scrape_url(
        url,
        params={'formats': ['markdown']}
    )
    return scrape_status

def generate_completion(role, task, content):
    """Generate a completion using OpenAI."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"You are a {role}. {task}"},
            {"role": "user", "content": content}
        ]
    )
    return response.choices[0].message.content

def analyze_website_content(content):
    """Analyze the scraped website content using OpenAI."""
    analysis = generate_completion(
        "marketing analyst",
        "Analyze the following website content and provide key insights for marketing strategy.",
        content
    )
    return {"analysis": analysis}

def generate_copy(brief):
    """Generate marketing copy based on a brief using OpenAI."""
    copy = generate_completion(
        "copywriter",
        "Create compelling marketing copy based on the following brief.",
        brief
    )
    return {"copy": copy}

def create_campaign_idea(target_audience, goals):
    """Create a campaign idea based on target audience and goals using OpenAI."""
    campaign_idea = generate_completion(
        "marketing strategist",
        "Create an innovative campaign idea based on the target audience and goals provided.",
        f"Target Audience: {target_audience}\nGoals: {goals}"
    )
    return {"campaign_idea": campaign_idea}

def handoff_to_copywriter():
    """Hand off the campaign idea to the copywriter agent."""
    return copywriter_agent

def handoff_to_analyst():
    """Hand off the website content to the analyst agent."""
    return analyst_agent

def handoff_to_campaign_idea():
    """Hand off the target audience and goals to the campaign idea agent."""
    return campaign_idea_agent

def handoff_to_website_scraper():
    """Hand off the url to the website scraper agent."""
    return website_scraper_agent

user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with a URL that the user wants to create a marketing strategy for. Ask clarification questions if needed. Be concise.",
    functions=[handoff_to_website_scraper],
)

website_scraper_agent = Agent(
    name="Website Scraper Agent",
    instructions="You are a website scraper agent specialized in scraping website content.",
    functions=[scrape_website, handoff_to_analyst],
)

analyst_agent = Agent(
    name="Analyst Agent",
    instructions="You are an analyst agent that examines website content and provides insights for marketing strategies. Be concise.",
    functions=[analyze_website_content, handoff_to_campaign_idea],
)

campaign_idea_agent = Agent(
    name="Campaign Idea Agent",
    instructions="You are a campaign idea agent that creates innovative marketing campaign ideas based on website content and target audience. Be concise.",
    functions=[create_campaign_idea, handoff_to_copywriter],
)

copywriter_agent = Agent(
    name="Copywriter Agent",
    instructions="You are a copywriter agent specialized in creating compelling marketing copy based on website content and campaign ideas. Be concise.",
    functions=[generate_copy],
)

if __name__ == "__main__":
    # Run the demo loop with the user interface agent
    run_demo_loop(user_interface_agent, stream=True)
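To make the hand-off chain concrete outside the interactive REPL, a hedged sketch that calls the same functions directly; the URL, audience, and goal strings are placeholders, and it assumes the scrape response exposes a 'markdown' field as in the crawler examples elsewhere in this commit:

```python
# Direct pipeline: scrape -> analyze -> campaign idea -> copy, without the agent loop.
page = scrape_website("https://firecrawl.dev")
insights = analyze_website_content(page["markdown"])
idea = create_campaign_idea("developers building RAG apps", "increase sign-ups")
copy = generate_copy(f"{insights['analysis']}\n\n{idea['campaign_idea']}")
print(copy["copy"])
```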
2 examples/openai_swarm_firecrawl/requirements.txt Normal file
@ -0,0 +1,2 @@
firecrawl-py
openai
@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=
120 examples/openai_swarm_firecrawl_web_extractor/main.py Normal file
@ -0,0 +1,120 @@
import os
from firecrawl import FirecrawlApp
from swarm import Agent
from swarm.repl import run_demo_loop
import dotenv
from serpapi import GoogleSearch
from openai import OpenAI

dotenv.load_dotenv()

# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def search_google(query, objective):
    """Search Google using SerpAPI."""
    print(f"Parameters: query={query}, objective={objective}")
    search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
    results = search.get_dict().get("organic_results", [])
    return {"objective": objective, "results": results}

def map_url_pages(url, objective):
    """Map a website's pages using Firecrawl."""

    search_query = generate_completion(
        "website search query generator",
        f"Generate a 1-2 word search query for the website: {url} based on the objective",
        "Objective: " + objective
    )
    print(f"Parameters: url={url}, objective={objective}, search_query={search_query}")
    map_status = app.map_url(url, params={'search': search_query})
    if map_status.get('status') == 'success':
        links = map_status.get('links', [])
        top_link = links[0] if links else None
        return {"objective": objective, "results": [top_link] if top_link else []}
    else:
        return {"objective": objective, "results": []}

def scrape_url(url, objective):
    """Scrape a website using Firecrawl."""
    print(f"Parameters: url={url}, objective={objective}")
    scrape_status = app.scrape_url(
        url,
        params={'formats': ['markdown']}
    )
    return {"objective": objective, "results": scrape_status}

def analyze_website_content(content, objective):
    """Analyze the scraped website content using OpenAI."""
    print(f"Parameters: content={content[:50]}..., objective={objective}")
    analysis = generate_completion(
        "website data extractor",
        f"Analyze the following website content and extract a JSON object based on the objective.",
        "Objective: " + objective + "\nContent: " + content
    )
    return {"objective": objective, "results": analysis}

def generate_completion(role, task, content):
    """Generate a completion using OpenAI."""
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"You are a {role}. {task}"},
            {"role": "user", "content": content}
        ]
    )
    return response.choices[0].message.content

def handoff_to_search_google():
    """Hand off the search query to the search google agent."""
    return google_search_agent

def handoff_to_map_url():
    """Hand off the url to the map url agent."""
    return map_url_agent

def handoff_to_website_scraper():
    """Hand off the url to the website scraper agent."""
    return website_scraper_agent

def handoff_to_analyst():
    """Hand off the website content to the analyst agent."""
    return analyst_agent

user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start with a web data extraction objective that the user wants to achieve by searching the web, mapping the web pages, and extracting the content from a specific page. Be concise.",
    functions=[handoff_to_search_google],
)

google_search_agent = Agent(
    name="Google Search Agent",
    instructions="You are a google search agent specialized in searching the web. Only search for the website not any specific page. When you are done, you must hand off to the map agent.",
    functions=[search_google, handoff_to_map_url],
)

map_url_agent = Agent(
    name="Map URL Agent",
    instructions="You are a map url agent specialized in mapping the web pages. When you are done, you must hand off the results to the website scraper agent.",
    functions=[map_url_pages, handoff_to_website_scraper],
)

website_scraper_agent = Agent(
    name="Website Scraper Agent",
    instructions="You are a website scraper agent specialized in scraping website content. When you are done, you must hand off the website content to the analyst agent to extract the data based on the objective.",
    functions=[scrape_url, handoff_to_analyst],
)

analyst_agent = Agent(
    name="Analyst Agent",
    instructions="You are an analyst agent that examines website content and returns a JSON object. When you are done, you must return a JSON object.",
    functions=[analyze_website_content],
)

if __name__ == "__main__":
    # Run the demo loop with the user interface agent
    run_demo_loop(user_interface_agent, stream=True)
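Note that `map_url_pages` above keys its success check on a 'status' field, while the SDK's `map_url` shown earlier in this diff returns the raw response with 'success' and 'links'. A hedged, illustrative helper (not part of the committed file) that tolerates either shape:

```python
def top_mapped_link(map_response, objective):
    """Return the first mapped link, accepting either a 'success' flag or a 'status' field."""
    ok = bool(map_response) and (
        map_response.get("success") is True or map_response.get("status") == "success"
    )
    links = map_response.get("links", []) if ok else []
    return {"objective": objective, "results": links[:1]}
```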
@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git
3 examples/sales_web_crawler/.env.example Normal file
@ -0,0 +1,3 @@
OPENAI_API_KEY=
FIRECRAWL_API_KEY=
SERP_API_KEY=
78 examples/sales_web_crawler/app.py Normal file
@ -0,0 +1,78 @@
import csv
import json
import os

from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from openai import OpenAI
from serpapi import GoogleSearch
from swarm import Agent
from swarm.repl import run_demo_loop

load_dotenv()

# Initialize FirecrawlApp and OpenAI
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def crawl_and_analyze_url(url, objective):
    """Crawl a website using Firecrawl and analyze the content."""
    print(f"Parameters: url={url}, objective={objective}")
    # Crawl the website
    crawl_status = app.crawl_url(
        url,
        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}},
        poll_interval=5
    )
    crawl_status = crawl_status['data']
    # Process each 'markdown' element individually
    combined_results = []
    for item in crawl_status:
        if 'markdown' in item:
            content = item['markdown']
            # Analyze the content
            analysis = generate_completion(
                "website data extractor",
                f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
                "Objective: " + objective + "\nContent: " + content
            )
            # Parse the JSON result
            try:
                result = json.loads(analysis)
                combined_results.append(result)
            except json.JSONDecodeError:
                print(f"Could not parse JSON from analysis: {analysis}")
    # Combine the results
    return {"objective": objective, "results": combined_results}

def generate_completion(role, task, content):
    """Generate a completion using OpenAI."""
    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"You are a {role}. {task}"},
            {"role": "user", "content": content}
        ]
    )
    return response.choices[0].message.content

def handoff_to_crawl_url():
    """Hand off the url to the crawl url agent."""
    return crawl_website_agent

user_interface_agent = Agent(
    name="User Interface Agent",
    instructions="You are a user interface agent that handles all interactions with the user. You need to always start by asking for a URL to crawl and the web data extraction objective. Be concise.",
    functions=[handoff_to_crawl_url],
)

crawl_website_agent = Agent(
    name="Crawl Website Agent",
    instructions="You are a crawl URL agent specialized in crawling web pages and analyzing their content. When you are done, you must print the results to the console.",
    functions=[crawl_and_analyze_url],
)

if __name__ == "__main__":
    # Run the demo loop with the user interface agent
    run_demo_loop(user_interface_agent, stream=True)
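app.py imports `csv` but never writes a file; a hedged sketch (function name, file name, and column handling are illustrative, not part of the committed example) of how the combined results could be exported for a sales workflow:

```python
import csv

def write_results_to_csv(combined_results, path="leads.csv"):
    """Write a list of flat JSON objects to a CSV file, unioning their keys as columns."""
    if not combined_results:
        return
    fieldnames = sorted({key for row in combined_results for key in row})
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(combined_results)
```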
4 examples/sales_web_crawler/requirements.txt Normal file
@ -0,0 +1,4 @@
firecrawl-py
openai
google-search-results
git+https://github.com/openai/swarm.git
@ -98,7 +98,7 @@
   "source": [
    "# Create a cache with a 5 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-pro-001\",\n",
    "    model=\"models/gemini-1.5-pro-002\",\n",
    "    display_name=\"website crawl testing again\", # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
@ -0,0 +1,166 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import datetime\n",
    "import time\n",
    "import google.generativeai as genai\n",
    "from google.generativeai import caching\n",
    "from dotenv import load_dotenv\n",
    "from firecrawl import FirecrawlApp\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Retrieve API keys from environment variables\n",
    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Configure the Google Generative AI module with the API key\n",
    "genai.configure(api_key=google_api_key)\n",
    "\n",
    "# Initialize the FirecrawlApp with your API key\n",
    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No data returned from crawl.\n"
     ]
    }
   ],
   "source": [
    "# Crawl a website\n",
    "crawl_url = 'https://dify.ai/'\n",
    "params = {\n",
    "    \n",
    "    'crawlOptions': {\n",
    "        'limit': 100\n",
    "    }\n",
    "}\n",
    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "\n",
    "if crawl_result is not None:\n",
    "    # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
    "    cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
    "\n",
    "    # Save the modified results as a text file containing JSON data\n",
    "    with open('crawl_result.txt', 'w') as file:\n",
    "        file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
    "else:\n",
    "    print(\"No data returned from crawl.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the crawl results file using the Files API\n",
    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
    "\n",
    "# Wait for the file to finish processing\n",
    "while text_file.state.name == \"PROCESSING\":\n",
    "    print('Waiting for file to be processed.')\n",
    "    time.sleep(2)\n",
    "    text_file = genai.get_file(text_file.name)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a cache with a 15 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-flash-002\",\n",
    "    display_name=\"website crawl testing again\", # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
    "    ttl=datetime.timedelta(minutes=15),\n",
    ")\n",
    "# Construct a GenerativeModel which uses the created cache.\n",
    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
      "\n",
      "Here's how Firecrawl helps:\n",
      "\n",
      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
      "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
      "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
      "\n",
      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
      "\n",
      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the model\n",
    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
    "response_dict = response.to_dict()\n",
    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
    "print(response_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
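The notebook's crawl cell prints "No data returned from crawl." and still passes the v0-style `crawlOptions` key. A hedged sketch of the v1-style parameters used elsewhere in this commit (for example `sales_web_crawler/app.py`), in case the parameter shape is the cause; this is an assumption, not a confirmed diagnosis:

```python
# v1-style crawl parameters: 'limit' and 'scrapeOptions' at the top level.
crawl_result = app.crawl_url(
    'https://dify.ai/',
    params={'limit': 100, 'scrapeOptions': {'formats': ['markdown']}},
    poll_interval=5,
)
data = crawl_result.get('data', []) if isinstance(crawl_result, dict) else []
print(f"Crawled {len(data)} pages")
```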