Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 03:32:22 +08:00)

Commit 34adf43200: Merge branch 'main' into feat/go-html-to-md-parser

.github/workflows/ci.yml (vendored, 1 change)

@@ -28,6 +28,7 @@ env:
   HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
   HDX_NODE_BETA_MODE: 1
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
+  USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}

 jobs:

.github/workflows/fly.yml (vendored, 5 changes)

@@ -28,6 +28,7 @@ env:
   NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
   CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
   SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
+  USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}

 jobs:
   pre-deploy-e2e-tests:

@@ -57,6 +58,9 @@ jobs:
        run: npm run workers &
        working-directory: ./apps/api
        id: start_workers
+      - name: Wait for the application to be ready
+        run: |
+          sleep 10
      - name: Run E2E tests
        run: |
          npm run test:prod

@@ -338,6 +342,7 @@ jobs:
  build-and-publish-rust-sdk:
    name: Build and publish Rust SDK
    runs-on: ubuntu-latest
+    needs: deploy

    steps:
      - name: Checkout repository

README.md (10 changes)

@@ -14,10 +14,9 @@
   <a href="https://GitHub.com/mendableai/firecrawl/graphs/contributors">
     <img src="https://img.shields.io/github/contributors/mendableai/firecrawl.svg" alt="GitHub Contributors">
   </a>
-  <a href="https://github.com/mendableai/firecrawl">
-    <img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source">
+  <a href="https://firecrawl.dev">
+    <img src="https://img.shields.io/badge/Visit-firecrawl.dev-orange" alt="Visit firecrawl.dev">
   </a>

 </div>
 <div>
 <p align="center">

@@ -391,7 +390,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup
 from firecrawl.firecrawl import FirecrawlApp

-app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

 class ArticleSchema(BaseModel):
     title: str

@@ -466,8 +465,7 @@ import FirecrawlApp from "@mendable/firecrawl-js";
 import { z } from "zod";

 const app = new FirecrawlApp({
-  apiKey: "fc-YOUR_API_KEY",
-  version: "v0"
+  apiKey: "fc-YOUR_API_KEY"
 });

 // Define schema to extract contents into

@@ -1,11 +1,11 @@
 import request from "supertest";
-import dotenv from "dotenv";
+import { configDotenv } from "dotenv";
 import {
   ScrapeRequest,
   ScrapeResponseRequestTest,
 } from "../../controllers/v1/types";

-dotenv.config();
+configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";

 describe("E2E Tests for v1 API Routes", () => {

@@ -22,6 +22,13 @@ describe("E2E Tests for v1 API Routes", () => {
     const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
       "/is-production"
     );

+    console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION);
+    console.log('?', process.env.USE_DB_AUTHENTICATION === 'true');
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    console.log('!!useDbAuthentication', !!useDbAuthentication);
+    console.log('!useDbAuthentication', !useDbAuthentication);
+
     expect(response.statusCode).toBe(200);
     expect(response.body).toHaveProperty("isProduction");
   });

@@ -29,9 +36,10 @@ describe("E2E Tests for v1 API Routes", () => {

   describe("POST /v1/scrape", () => {
     it.concurrent("should require authorization", async () => {
-      const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
-        "/v1/scrape"
-      );
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/scrape")
+        .send({ url: "https://firecrawl.dev"})

       expect(response.statusCode).toBe(401);
     });

@@ -389,7 +397,7 @@ describe("E2E Tests for v1 API Routes", () => {
       const scrapeRequest: ScrapeRequest = {
         url: "https://ycombinator.com/companies",
         formats: ["markdown"],
-        waitFor: 5000
+        waitFor: 8000
       };

       const response: ScrapeResponseRequestTest = await request(TEST_URL)

@@ -451,9 +459,9 @@ describe("E2E Tests for v1 API Routes", () => {

 describe("POST /v1/map", () => {
   it.concurrent("should require authorization", async () => {
-    const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
-      "/v1/map"
-    );
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .send({ url: "https://firecrawl.dev" });
     expect(response.statusCode).toBe(401);
   });

@@ -534,7 +542,9 @@ describe("POST /v1/map", () => {
     const links = response.body.links as unknown[];
     expect(Array.isArray(links)).toBe(true);
     expect(links.length).toBeGreaterThan(0);
-    expect(links[0]).toContain("docs.firecrawl.dev");
+
+    const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
+    expect(containsDocsFirecrawlDev).toBe(true);
   });

   it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {

@@ -559,7 +569,9 @@ describe("POST /v1/map", () => {
     const links = response.body.links as unknown[];
     expect(Array.isArray(links)).toBe(true);
     expect(links.length).toBeGreaterThan(0);
-    expect(links[0]).toContain("docs.firecrawl.dev");
+
+    const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
+    expect(containsDocsFirecrawlDev).toBe(true);
   }, 10000)

   it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {

@@ -609,9 +621,9 @@ describe("POST /v1/map", () => {

 describe("POST /v1/crawl", () => {
   it.concurrent("should require authorization", async () => {
-    const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
-      "/v1/crawl"
-    );
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/crawl")
+      .send({ url: "https://firecrawl.dev" });
     expect(response.statusCode).toBe(401);
   });

@@ -863,7 +875,7 @@ describe("GET /v1/crawl/:jobId", () => {
       .post("/v1/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://docs.mendable.ai" });
+      .send({ url: "https://docs.firecrawl.dev" });
     expect(crawlResponse.statusCode).toBe(200);

     let isCompleted = false;

@@ -893,9 +905,7 @@ describe("GET /v1/crawl/:jobId", () => {
       expect(completedResponse.body.data[0]).not.toHaveProperty("content");
       expect(completedResponse.body.data[0]).toHaveProperty("markdown");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].metadata.statusCode).toBe(
-        200
-      );
+      expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
       expect(
         completedResponse.body.data[0].metadata.error
       ).toBeUndefined();

@@ -659,7 +659,7 @@ describe("E2E Tests for v0 API Routes", () => {
       .post("/v0/crawl")
       .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
       .set("Content-Type", "application/json")
-      .send({ url: "https://mendable.ai/blog" });
+      .send({ url: "https://firecrawl.dev/blog" });
     expect(crawlResponse.statusCode).toBe(200);

     let isCompleted = false;

@@ -689,10 +689,8 @@ describe("E2E Tests for v0 API Routes", () => {
       expect(completedResponse.body.data[0]).toHaveProperty("content");
       expect(completedResponse.body.data[0]).toHaveProperty("markdown");
       expect(completedResponse.body.data[0]).toHaveProperty("metadata");
-      expect(completedResponse.body.data[0].content).toContain("Mendable");
-      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
-        200
-      );
+      expect(completedResponse.body.data[0].content).toContain("Firecrawl");
+      expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
       expect(
         completedResponse.body.data[0].metadata.pageError
       ).toBeUndefined();

@@ -701,7 +699,7 @@ describe("E2E Tests for v0 API Routes", () => {
         (doc) =>
           doc.metadata &&
           doc.metadata.sourceURL &&
-          doc.metadata.sourceURL.includes("mendable.ai/blog")
+          doc.metadata.sourceURL.includes("firecrawl.dev/blog")
       );

       expect(childrenLinks.length).toBe(completedResponse.body.data.length);

@@ -5,6 +5,8 @@ import { supabase_service } from "../../../src/services/supabase";
 import { Logger } from "../../../src/lib/logger";
 import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
 import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function crawlCancelController(req: Request, res: Response) {
   try {

@@ -6,6 +6,8 @@ import { Logger } from "../../../src/lib/logger";
 import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
 import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
 import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function getJobs(ids: string[]) {
   const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);

@@ -244,14 +244,10 @@ export async function scrapeController(req: Request, res: Response) {
     }
     if (creditsToBeBilled > 0) {
       // billing for doc done on queue end, bill only for llm extraction
-      const billingResult = await billTeam(team_id, creditsToBeBilled);
-      if (!billingResult.success) {
-        return res.status(402).json({
-          success: false,
-          error:
-            "Failed to bill team. Insufficient credits or subscription not found.",
-        });
-      }
+      billTeam(team_id, creditsToBeBilled).catch(error => {
+        Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
+        // Optionally, you could notify an admin or add to a retry queue here
+      });
     }
   }

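This change makes billing non-blocking: the request no longer waits on billTeam or returns a 402 when billing fails; failures are only logged. The same fire-and-forget pattern is applied in searchHelper, mapController, the v1 scrapeController, and runWebScraper below. A minimal sketch of the pattern, using billTeam and Logger as imported in these controllers (the wrapper name is illustrative, not part of the commit):

import { billTeam } from "../../services/billing/credit_billing";
import { Logger } from "../../lib/logger";

// Fire-and-forget billing: start the charge but do not await it,
// so a billing outage cannot fail or delay the user's request.
function billTeamInBackground(teamId: string, credits: number): void {
  billTeam(teamId, credits).catch((error) => {
    // Only log; optionally notify an admin or push to a retry queue.
    Logger.error(`Failed to bill team ${teamId} for ${credits} credits: ${error}`);
  });
}

The trade-off is that a failed charge is no longer surfaced to the caller, so under-billing has to be caught in the logs or by a retry mechanism.
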
@@ -54,18 +54,10 @@ export async function searchHelper(

   if (justSearch) {
-    const billingResult = await billTeam(
-      team_id,
-      res.length
-    );
-    if (!billingResult.success) {
-      return {
-        success: false,
-        error:
-          "Failed to bill team. Insufficient credits or subscription not found.",
-        returnCode: 402,
-      };
-    }
+    billTeam(team_id, res.length).catch(error => {
+      Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
+      // Optionally, you could notify an admin or add to a retry queue here
+    });
     return { success: true, data: res, returnCode: 200 };
   }

@@ -5,6 +5,8 @@ import { supabase_service } from "../../services/supabase";
 import { Logger } from "../../lib/logger";
 import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
 import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function crawlCancelController(req: Request, res: Response) {
   try {

@@ -3,6 +3,8 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentCo
 import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
 import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function getJob(id: string) {
   const job = await getScrapeQueue().getJob(id);

@@ -18,6 +18,7 @@ import { fireEngineMap } from "../../search/fireEngine";
 import { billTeam } from "../../services/billing/credit_billing";
 import { logJob } from "../../services/logging/log_job";
 import { performCosineSimilarity } from "../../lib/map-cosine";
+import { Logger } from "../../lib/logger";

 configDotenv();

@@ -61,8 +62,8 @@ export async function mapController(
     : `site:${req.body.url}`;
   // www. seems to exclude subdomains in some cases
   const mapResults = await fireEngineMap(mapUrl, {
-    // limit to 50 results (beta)
-    numResults: Math.min(limit, 50),
+    // limit to 100 results (beta)
+    numResults: Math.min(limit, 100),
   });

   if (mapResults.length > 0) {

@@ -100,7 +101,10 @@ export async function mapController(
   // remove duplicates that could be due to http/https or www
   links = removeDuplicateUrls(links);

-  await billTeam(req.auth.team_id, 1);
+  billTeam(req.auth.team_id, 1).catch(error => {
+    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
+    // Optionally, you could notify an admin or add to a retry queue here
+  });

   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;

@@ -127,5 +131,6 @@ export async function mapController(
   return res.status(200).json({
     success: true,
     links: linksToReturn,
+    scrape_id: req.body.origin?.includes("website") ? id : undefined,
   });
 }

@@ -106,14 +106,10 @@ export async function scrapeController(
     creditsToBeBilled = 50;
   }

-  const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
-  if (!billingResult.success) {
-    return res.status(402).json({
-      success: false,
-      error:
-        "Failed to bill team. Insufficient credits or subscription not found.",
-    });
-  }
+  billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {
+    Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
+    // Optionally, you could notify an admin or add to a retry queue here
+  });

   if (!pageOptions || !pageOptions.includeRawHtml) {
     if (doc && doc.rawHtml) {

@@ -147,5 +143,6 @@ export async function scrapeController(
   return res.status(200).json({
     success: true,
     data: legacyDocumentConverter(doc),
+    scrape_id: origin?.includes("website") ? jobId : undefined,
   });
 }

@@ -225,6 +225,7 @@ export type ScrapeResponse =
       success: true;
       warning?: string;
       data: Document;
+      scrape_id?: string;
     };

 export interface ScrapeResponseRequestTest {

@@ -246,6 +247,7 @@ export type MapResponse =
   | {
       success: true;
       links: string[];
+      scrape_id?: string;
     };

 export type CrawlStatusParams = {

@@ -1,3 +1,6 @@
+import { configDotenv } from "dotenv";
+configDotenv();
+
 enum LogLevel {
   NONE = 'NONE', // No logs will be output.
   ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.

@@ -25,7 +28,8 @@ export class Logger {
     const color = Logger.colors[level];
     console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);

-    // if (process.env.USE_DB_AUTH) {
+    // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    // if (useDbAuthentication) {
     //   save to supabase? another place?
     //   supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
     // }

@@ -2,6 +2,8 @@ import { Job } from "bullmq";
 import type { baseScrapers } from "../scraper/WebScraper/single_url";
 import { supabase_service as supabase } from "../services/supabase";
 import { Logger } from "./logger";
+import { configDotenv } from "dotenv";
+configDotenv();

 export type ScrapeErrorEvent = {
   type: "error",

@@ -36,7 +38,8 @@ export class ScrapeEvents {
   static async insert(jobId: string, content: ScrapeEvent) {
     if (jobId === "TEST") return null;

-    if (process.env.USE_DB_AUTHENTICATION) {
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    if (useDbAuthentication) {
       try {
         const result = await supabase.from("scrape_events").insert({
           job_id: jobId,

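Throughout this commit, checks on USE_DB_AUTHENTICATION are normalized. Environment variables are always strings (or undefined), so a bare truthiness check like `if (process.env.USE_DB_AUTHENTICATION)` passes even when the variable is set to "false", while `=== "false"` misses the unset case; the replacement parses the flag once against the literal "true". A minimal sketch of the normalized check, as used in the hunks above and below:

import { configDotenv } from "dotenv";
configDotenv();

// Env vars are strings, so compare against the literal "true"
// instead of relying on truthiness.
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";

if (!useDbAuthentication) {
  // Skip any Supabase-backed work when DB authentication is disabled.
}
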
@@ -1,5 +1,8 @@
 import { AuthResponse } from "../../src/types";
 import { Logger } from "./logger";
+import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();

 let warningCount = 0;

@@ -7,7 +10,8 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
   originalFunction: (...args: U) => Promise<T>
 ) {
   return async function (...args: U): Promise<T> {
-    if (process.env.USE_DB_AUTHENTICATION === "false") {
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    if (!useDbAuthentication) {
       if (warningCount < 5) {
         Logger.warn("You're bypassing authentication");
         warningCount++;

@@ -17,6 +21,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
     try {
       return await originalFunction(...args);
     } catch (error) {
+      Sentry.captureException(error);
       Logger.error(`Error in withAuth function: ${error}`);
       return { success: false, error: error.message } as T;
     }

@@ -12,6 +12,8 @@ import { Document } from "../lib/entities";
 import { supabase_service } from "../services/supabase";
 import { Logger } from "../lib/logger";
 import { ScrapeEvents } from "../lib/scrape-events";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function startWebScraperPipeline({
   job,

@@ -118,15 +120,10 @@ export async function runWebScraper({
       : docs;

   if(is_scrape === false) {
-    const billingResult = await billTeam(team_id, filteredDocs.length);
-    if (!billingResult.success) {
-      // throw new Error("Failed to bill team, no subscription was found");
-      return {
-        success: false,
-        message: "Failed to bill team, no subscription was found",
-        docs: [],
-      };
-    }
+    billTeam(team_id, filteredDocs.length).catch(error => {
+      Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
+      // Optionally, you could notify an admin or add to a retry queue here
+    });
   }

@@ -144,7 +141,8 @@ export async function runWebScraper({

 const saveJob = async (job: Job, result: any, token: string, mode: string) => {
   try {
-    if (process.env.USE_DB_AUTHENTICATION === "true") {
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    if (useDbAuthentication) {
       const { data, error } = await supabase_service
         .from("firecrawl_jobs")
         .update({ docs: result })

@@ -33,7 +33,9 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
     const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
     if (!success) {
       Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
-      return res.status(402).json({ success: false, error: "Insufficient credits" });
+      if (!res.headersSent) {
+        return res.status(402).json({ success: false, error: "Insufficient credits" });
+      }
     }
     req.account = { remainingCredits }
     next();

@@ -52,7 +54,9 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW
     );

     if (!success) {
-      return res.status(status).json({ success: false, error });
+      if (!res.headersSent) {
+        return res.status(status).json({ success: false, error });
+      }
     }

     req.auth = { team_id, plan };

@@ -67,7 +71,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
   if (req.headers["x-idempotency-key"]) {
     const isIdempotencyValid = await validateIdempotencyKey(req);
     if (!isIdempotencyValid) {
-      return res.status(409).json({ success: false, error: "Idempotency key already used" });
+      if (!res.headersSent) {
+        return res.status(409).json({ success: false, error: "Idempotency key already used" });
+      }
     }
     createIdempotencyKey(req);
   }

@@ -78,7 +84,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)

 function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
   if (req.body.url && isUrlBlocked(req.body.url)) {
-    return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+    if (!res.headersSent) {
+      return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+    }
   }
   next();
 }

@@ -96,26 +104,26 @@ export const v1Router = express.Router();

 v1Router.post(
   "/scrape",
-  blocklistMiddleware,
   authMiddleware(RateLimiterMode.Scrape),
   checkCreditsMiddleware(1),
+  blocklistMiddleware,
   wrap(scrapeController)
 );

 v1Router.post(
   "/crawl",
-  blocklistMiddleware,
   authMiddleware(RateLimiterMode.Crawl),
-  idempotencyMiddleware,
   checkCreditsMiddleware(),
+  blocklistMiddleware,
+  idempotencyMiddleware,
   wrap(crawlController)
 );

 v1Router.post(
   "/map",
-  blocklistMiddleware,
   authMiddleware(RateLimiterMode.Map),
   checkCreditsMiddleware(1),
+  blocklistMiddleware,
   wrap(mapController)
 );

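The middleware hunks above wrap each early error response in a `res.headersSent` check so a response body is only written if nothing has been sent yet, which avoids Express's "Cannot set headers after they are sent to the client" error when two handlers race on the same request; the route definitions also move blocklist and idempotency checks after authentication and credit checks. A minimal sketch of the guard pattern (Express types; shouldReject stands in for the middleware's own check and is not part of the commit):

import { Request, Response, NextFunction } from "express";

declare function shouldReject(req: Request): boolean; // placeholder for blocklist/credits/idempotency logic

function guardedMiddleware(req: Request, res: Response, next: NextFunction) {
  if (shouldReject(req)) {
    // Only reply if no earlier handler has already written a response.
    if (!res.headersSent) {
      return res.status(403).json({ success: false, error: "Rejected" });
    }
  }
  next();
}
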
@@ -23,12 +23,15 @@ import { clientSideError } from "../../strings";

 dotenv.config();

+const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
+const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
+
 export const baseScrapers = [
-  "fire-engine;chrome-cdp",
-  "fire-engine",
-  "scrapingBee",
-  process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
-  "scrapingBeeLoad",
+  useFireEngine ? "fire-engine;chrome-cdp" : undefined,
+  useFireEngine ? "fire-engine" : undefined,
+  useScrapingBee ? "scrapingBee" : undefined,
+  useFireEngine ? undefined : "playwright",
+  useScrapingBee ? "scrapingBeeLoad" : undefined,
   "fetch",
 ].filter(Boolean);

@@ -85,18 +88,18 @@ function getScrapingFallbackOrder(
   });

   let defaultOrder = [
-    !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
-    !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
-    "scrapingBee",
-    process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
-    "scrapingBeeLoad",
+    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
+    useFireEngine ? "fire-engine" : undefined,
+    useScrapingBee ? "scrapingBee" : undefined,
+    useScrapingBee ? "scrapingBeeLoad" : undefined,
+    useFireEngine ? undefined : "playwright",
     "fetch",
   ].filter(Boolean);

   if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
     defaultOrder = [
       "fire-engine",
-      process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
+      useFireEngine ? undefined : "playwright",
       ...defaultOrder.filter(
         (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
       ),

@@ -5,7 +5,7 @@ import { supabase_service } from "../supabase";
 import { Logger } from "../../lib/logger";
 import { getValue, setValue } from "../redis";
 import { redlock } from "../redlock";
+import * as Sentry from "@sentry/node";

 const FREE_CREDITS = 500;

@@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) {
   ]);

   let couponCredits = 0;
+  let sortedCoupons = [];
+
   if (coupons && coupons.length > 0) {
     couponCredits = coupons.reduce(
       (total, coupon) => total + coupon.credits,
       0
     );
+    sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits);
   }

-  let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits);
-
   // using coupon credits:
   if (couponCredits > 0) {
     // if there is no subscription and they have enough coupon credits

@@ -175,9 +176,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
   }

-  // Retrieve the team's active subscription and check for available coupons concurrently
-  const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
-    await Promise.all([
+  let cacheKeySubscription = `subscription_${team_id}`;
+  let cacheKeyCoupons = `coupons_${team_id}`;
+
+  // Try to get data from cache first
+  const [cachedSubscription, cachedCoupons] = await Promise.all([
+    getValue(cacheKeySubscription),
+    getValue(cacheKeyCoupons)
+  ]);
+
+  let subscription, subscriptionError, coupons;
+
+  if (cachedSubscription && cachedCoupons) {
+    subscription = JSON.parse(cachedSubscription);
+    coupons = JSON.parse(cachedCoupons);
+  } else {
+    // If not in cache, retrieve from database
+    const [subscriptionResult, couponsResult] = await Promise.all([
       supabase_service
         .from("subscriptions")
         .select("id, price_id, current_period_start, current_period_end")

@@ -191,6 +207,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
         .eq("status", "active"),
     ]);

+    subscription = subscriptionResult.data;
+    subscriptionError = subscriptionResult.error;
+    coupons = couponsResult.data;
+
+    // Cache the results for a minute, sub can be null and that's fine
+    await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null
+    await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute
+  }
+
   let couponCredits = 0;
   if (coupons && coupons.length > 0) {
     couponCredits = coupons.reduce(

@@ -211,41 +237,54 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {

   let creditUsages;
   let creditUsageError;
-  let retries = 0;
-  const maxRetries = 3;
-  const retryInterval = 2000; // 2 seconds
-
-  while (retries < maxRetries) {
-    const result = await supabase_service
-      .from("credit_usage")
-      .select("credits_used")
-      .is("subscription_id", null)
-      .eq("team_id", team_id);
-
-    creditUsages = result.data;
-    creditUsageError = result.error;
-
-    if (!creditUsageError) {
-      break;
-    }
-
-    retries++;
-    if (retries < maxRetries) {
-      await new Promise(resolve => setTimeout(resolve, retryInterval));
-    }
-  }
-
-  if (creditUsageError) {
-    Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
-    throw new Error(
-      `Failed to retrieve credit usage for team_id: ${team_id}`
-    );
-  }
-
-  const totalCreditsUsed = creditUsages.reduce(
-    (acc, usage) => acc + usage.credits_used,
-    0
-  );
+  let totalCreditsUsed = 0;
+  const cacheKeyCreditUsage = `credit_usage_${team_id}`;
+
+  // Try to get credit usage from cache
+  const cachedCreditUsage = await getValue(cacheKeyCreditUsage);
+
+  if (cachedCreditUsage) {
+    totalCreditsUsed = parseInt(cachedCreditUsage);
+  } else {
+    let retries = 0;
+    const maxRetries = 3;
+    const retryInterval = 2000; // 2 seconds
+
+    while (retries < maxRetries) {
+      const result = await supabase_service
+        .from("credit_usage")
+        .select("credits_used")
+        .is("subscription_id", null)
+        .eq("team_id", team_id);
+
+      creditUsages = result.data;
+      creditUsageError = result.error;
+
+      if (!creditUsageError) {
+        break;
+      }
+
+      retries++;
+      if (retries < maxRetries) {
+        await new Promise(resolve => setTimeout(resolve, retryInterval));
+      }
+    }
+
+    if (creditUsageError) {
+      Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
+      throw new Error(
+        `Failed to retrieve credit usage for team_id: ${team_id}`
+      );
+    }
+
+    totalCreditsUsed = creditUsages.reduce(
+      (acc, usage) => acc + usage.credits_used,
+      0
+    );
+
+    // Cache the result for 30 seconds
+    await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30);
+  }

   Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);

@@ -255,7 +294,9 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     const creditLimit = FREE_CREDITS;
     const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;

-    if (creditUsagePercentage >= 0.8) {
+    // Add a check to ensure totalCreditsUsed is greater than 0
+    if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
+      Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
       await sendNotification(
         team_id,
         NotificationType.APPROACHING_LIMIT,

@@ -309,7 +350,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     if (creditUsages && creditUsages.length > 0) {
       totalCreditsUsed = creditUsages[0].total_credits_used;
-      await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
+      await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes
       // Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
     }
   }

@@ -322,17 +363,38 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {

   // Adjust total credits used by subtracting coupon value
   const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
-  // Get the price details
-  const { data: price, error: priceError } = await supabase_service
-    .from("prices")
-    .select("credits")
-    .eq("id", subscription.price_id)
-    .single();
-
-  if (priceError) {
-    throw new Error(
-      `Failed to retrieve price for price_id: ${subscription.price_id}`
-    );
+
+  // Get the price details from cache or database
+  const priceCacheKey = `price_${subscription.price_id}`;
+  let price;
+
+  try {
+    const cachedPrice = await getValue(priceCacheKey);
+    if (cachedPrice) {
+      price = JSON.parse(cachedPrice);
+    } else {
+      const { data, error: priceError } = await supabase_service
+        .from("prices")
+        .select("credits")
+        .eq("id", subscription.price_id)
+        .single();
+
+      if (priceError) {
+        throw new Error(
+          `Failed to retrieve price for price_id: ${subscription.price_id}`
+        );
+      }
+
+      price = data;
+      // There are only 21 records, so this is super fine
+      // Cache the price for a long time (e.g., 1 day)
+      await setValue(priceCacheKey, JSON.stringify(price), 86400);
+    }
+  } catch (error) {
+    Logger.error(`Error retrieving or caching price: ${error}`);
+    Sentry.captureException(error);
+    // If errors, just assume it's a big number so user don't get an error
+    price = { credits: 1000000 };
   }

   const creditLimit = price.credits;

@@ -462,8 +524,8 @@ async function createCreditUsage({
   subscription_id?: string;
   credits: number;
 }) {
-  const { data: credit_usage } = await supabase_service
+  await supabase_service
     .from("credit_usage")
     .insert([
       {
         team_id,

@@ -471,8 +533,7 @@ async function createCreditUsage({
         subscription_id: subscription_id || null,
         created_at: new Date(),
       },
-    ])
-    .select();
+    ]);

-  return { success: true, credit_usage };
+  return { success: true };
 }

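The credit-check path above now wraps each Supabase lookup (subscription, coupons, credit usage, price) in a Redis-backed cache-aside step using the existing getValue/setValue helpers, with short TTLs for usage data and a day-long TTL for the nearly static price table. A minimal sketch of that cache-aside pattern, assuming setValue(key, value, ttlSeconds) and getValue(key) behave as in the code above; cacheJson and fetchFn are illustrative names, not part of the commit:

import { getValue, setValue } from "../redis";

// Cache-aside: return the cached JSON value when present, otherwise
// compute it, cache it for ttlSeconds, and return the fresh value.
async function cacheJson<T>(key: string, ttlSeconds: number, fetchFn: () => Promise<T>): Promise<T> {
  const cached = await getValue(key);
  if (cached) {
    return JSON.parse(cached) as T;
  }
  const fresh = await fetchFn();
  await setValue(key, JSON.stringify(fresh), ttlSeconds);
  return fresh;
}

The design choice is to trade a little staleness (up to one TTL) for far fewer database round-trips on every credit check.
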
@@ -1,9 +1,11 @@
 import { supabase_service } from "../supabase";
 import { Logger } from "../../../src/lib/logger";
-import "dotenv/config";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function logCrawl(job_id: string, team_id: string) {
-  if (process.env.USE_DB_AUTHENTICATION === 'true') {
+  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+  if (useDbAuthentication) {
     try {
       const { data, error } = await supabase_service
         .from("bulljobs_teams")

@@ -4,10 +4,13 @@ import { FirecrawlJob } from "../../types";
 import { posthog } from "../posthog";
 import "dotenv/config";
 import { Logger } from "../../lib/logger";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function logJob(job: FirecrawlJob) {
   try {
-    if (process.env.USE_DB_AUTHENTICATION === "false") {
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    if (!useDbAuthentication) {
       return;
     }

@@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types";
 import { supabase_service } from "../supabase";
 import { PageOptions } from "../../lib/entities";
 import { Logger } from "../../lib/logger";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function logScrape(
   scrapeLog: ScrapeLog,
   pageOptions?: PageOptions
 ) {
-  if (process.env.USE_DB_AUTHENTICATION === "false") {
+  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+  if (!useDbAuthentication) {
     Logger.debug("Skipping logging scrape to Supabase");
     return;
   }

@@ -67,6 +67,6 @@ export function waitForJob(jobId: string, timeout: number) {
         reject((await getScrapeQueue().getJob(jobId)).failedReason);
       }
     }
-  }, 1000);
+  }, 500);
   })
 }

@@ -36,6 +36,8 @@ import {
 } from "../../src/lib/job-priority";
 import { PlanType } from "../types";
 import { getJobs } from "../../src/controllers/v1/crawl-status";
+import { configDotenv } from "dotenv";
+configDotenv();

 if (process.env.ENV === "production") {
   initSDK({

@@ -1,5 +1,7 @@
 import { createClient, SupabaseClient } from "@supabase/supabase-js";
 import { Logger } from "../lib/logger";
+import { configDotenv } from "dotenv";
+configDotenv();

 // SupabaseService class initializes the Supabase client conditionally based on environment variables.
 class SupabaseService {

@@ -8,8 +10,9 @@ class SupabaseService {
   constructor() {
     const supabaseUrl = process.env.SUPABASE_URL;
     const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
     // Only initialize the Supabase client if both URL and Service Token are provided.
-    if (process.env.USE_DB_AUTHENTICATION === "false") {
+    if (!useDbAuthentication) {
       // Warn the user that Authentication is disabled by setting the client to null
       Logger.warn(
         "Authentication is disabled. Supabase client will not be initialized."

@@ -3,6 +3,8 @@ import { legacyDocumentConverter } from "../../src/controllers/v1/types";
 import { Logger } from "../../src/lib/logger";
 import { supabase_service } from "./supabase";
 import { WebhookEventType } from "../types";
+import { configDotenv } from "dotenv";
+configDotenv();

 export const callWebhook = async (
   teamId: string,

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.2.1",
+  "version": "1.2.2",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "build/cjs/index.js",
   "types": "types/index.d.ts",

@@ -454,20 +454,27 @@ export default class FirecrawlApp
     checkInterval: number
   ): Promise<CrawlStatusResponse> {
     while (true) {
-      const statusResponse: AxiosResponse = await this.getRequest(
+      let statusResponse: AxiosResponse = await this.getRequest(
         `${this.apiUrl}/v1/crawl/${id}`,
         headers
       );
       if (statusResponse.status === 200) {
-        const statusData = statusResponse.data;
+        let statusData = statusResponse.data;
         if (statusData.status === "completed") {
           if ("data" in statusData) {
+            let data = statusData.data;
+            while ('next' in statusData) {
+              statusResponse = await this.getRequest(statusData.next, headers);
+              statusData = statusResponse.data;
+              data = data.concat(statusData.data);
+            }
+            statusData.data = data;
             return statusData;
           } else {
             throw new Error("Crawl job completed but no data was returned");
           }
         } else if (
-          ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
+          ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
         ) {
           checkInterval = Math.max(checkInterval, 2);
           await new Promise((resolve) =>

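A completed crawl may now return its documents in pages, each carrying a `next` URL; the JS and Python SDK changes follow that chain and concatenate every page before returning the final status. A minimal, self-contained TypeScript sketch of the same follow-the-next-link loop, assuming the v1 response shape used above (a `data` array plus an optional `next` URL); collectAllPages is an illustrative helper, not part of either SDK:

import axios from "axios";

// Follow "next" links until the API stops returning one,
// accumulating every page's documents into a single array.
async function collectAllPages(firstUrl: string, headers: Record<string, string>): Promise<any[]> {
  let response = await axios.get(firstUrl, { headers });
  let body = response.data;
  let documents: any[] = body.data ?? [];
  while ("next" in body) {
    response = await axios.get(body.next, { headers });
    body = response.data;
    documents = documents.concat(body.data ?? []);
  }
  return documents;
}
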
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp

-__version__ = "1.2.1"
+__version__ = "1.2.3"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

@@ -238,7 +238,6 @@ class FirecrawlApp:
         )
         if response.status_code == 200:
             response = response.json()
-            print(response)
             if response['success'] and 'links' in response:
                 return response['links']
             else:

@@ -346,6 +345,12 @@ class FirecrawlApp:
             status_data = status_response.json()
             if status_data['status'] == 'completed':
                 if 'data' in status_data:
+                    data = status_data['data']
+                    while 'next' in status_data:
+                        status_response = self._get_request(status_data['next'], headers)
+                        status_data = status_response.json()
+                        data.extend(status_data['data'])
+                    status_data['data'] = data
                     return status_data
                 else:
                     raise Exception('Crawl job completed but no data was returned')

@@ -1,5 +1,6 @@
 import { createClient, SupabaseClient } from "@supabase/supabase-js";
-import "dotenv/config";
+import { configDotenv } from "dotenv";
+configDotenv();

 // SupabaseService class initializes the Supabase client conditionally based on environment variables.
 class SupabaseService {

@@ -9,7 +10,8 @@ class SupabaseService {
     const supabaseUrl = process.env.SUPABASE_URL;
     const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
     // Only initialize the Supabase client if both URL and Service Token are provided.
-    if (process.env.USE_DB_AUTHENTICATION === "false") {
+    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+    if (!useDbAuthentication) {
       // Warn the user that Authentication is disabled by setting the client to null
       console.warn(
         "Authentication is disabled. Supabase client will not be initialized."

@@ -36,7 +38,8 @@ export const supabase_service: SupabaseClient = new Proxy(
   new SupabaseService(),
   {
     get: function (target, prop, receiver) {
-      if (process.env.USE_DB_AUTHENTICATION === "false") {
+      const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+      if (!useDbAuthentication) {
        console.debug(
          "Attempted to access Supabase client when it's not configured."
        );