Merge branch 'main' into feat/go-html-to-md-parser

This commit is contained in:
Nicolas 2024-09-05 12:13:14 -03:00
commit 34adf43200
33 changed files with 281 additions and 157 deletions

View File

@ -28,6 +28,7 @@ env:
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
jobs:

View File

@ -28,6 +28,7 @@ env:
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
jobs:
pre-deploy-e2e-tests:
@ -57,6 +58,9 @@ jobs:
run: npm run workers &
working-directory: ./apps/api
id: start_workers
- name: Wait for the application to be ready
run: |
sleep 10
- name: Run E2E tests
run: |
npm run test:prod
@ -338,6 +342,7 @@ jobs:
build-and-publish-rust-sdk:
name: Build and publish Rust SDK
runs-on: ubuntu-latest
needs: deploy
steps:
- name: Checkout repository

View File

@ -14,10 +14,9 @@
<a href="https://GitHub.com/mendableai/firecrawl/graphs/contributors">
<img src="https://img.shields.io/github/contributors/mendableai/firecrawl.svg" alt="GitHub Contributors">
</a>
<a href="https://github.com/mendableai/firecrawl">
<img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source">
<a href="https://firecrawl.dev">
<img src="https://img.shields.io/badge/Visit-firecrawl.dev-orange" alt="Visit firecrawl.dev">
</a>
</div>
<div>
<p align="center">
@ -391,7 +390,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup
from firecrawl.firecrawl import FirecrawlApp
app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
class ArticleSchema(BaseModel):
title: str
@ -466,8 +465,7 @@ import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";
const app = new FirecrawlApp({
apiKey: "fc-YOUR_API_KEY",
version: "v0"
apiKey: "fc-YOUR_API_KEY"
});
// Define schema to extract contents into

View File

@ -1,11 +1,11 @@
import request from "supertest";
import dotenv from "dotenv";
import { configDotenv } from "dotenv";
import {
ScrapeRequest,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
dotenv.config();
configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for v1 API Routes", () => {
@ -22,6 +22,13 @@ describe("E2E Tests for v1 API Routes", () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
"/is-production"
);
console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION);
console.log('?', process.env.USE_DB_AUTHENTICATION === 'true');
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
console.log('!!useDbAuthentication', !!useDbAuthentication);
console.log('!useDbAuthentication', !useDbAuthentication);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("isProduction");
});
@ -29,9 +36,10 @@ describe("E2E Tests for v1 API Routes", () => {
describe("POST /v1/scrape", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/scrape"
);
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/scrape")
.send({ url: "https://firecrawl.dev"})
expect(response.statusCode).toBe(401);
});
@ -389,7 +397,7 @@ describe("E2E Tests for v1 API Routes", () => {
const scrapeRequest: ScrapeRequest = {
url: "https://ycombinator.com/companies",
formats: ["markdown"],
waitFor: 5000
waitFor: 8000
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@ -451,9 +459,9 @@ describe("E2E Tests for v1 API Routes", () => {
describe("POST /v1/map", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/map"
);
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
@ -534,7 +542,9 @@ describe("POST /v1/map", () => {
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
expect(containsDocsFirecrawlDev).toBe(true);
});
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
@ -559,7 +569,9 @@ describe("POST /v1/map", () => {
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");
const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
expect(containsDocsFirecrawlDev).toBe(true);
}, 10000)
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
@ -609,9 +621,9 @@ describe("POST /v1/map", () => {
describe("POST /v1/crawl", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/crawl"
);
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/crawl")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
@ -863,7 +875,7 @@ describe("GET /v1/crawl/:jobId", () => {
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://docs.mendable.ai" });
.send({ url: "https://docs.firecrawl.dev" });
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
@ -893,9 +905,7 @@ describe("GET /v1/crawl/:jobId", () => {
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].metadata.statusCode).toBe(
200
);
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(
completedResponse.body.data[0].metadata.error
).toBeUndefined();

View File

@ -659,7 +659,7 @@ describe("E2E Tests for v0 API Routes", () => {
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({ url: "https://mendable.ai/blog" });
.send({ url: "https://firecrawl.dev/blog" });
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
@ -689,10 +689,8 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
expect(completedResponse.body.data[0].content).toContain("Mendable");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
200
);
expect(completedResponse.body.data[0].content).toContain("Firecrawl");
expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
@ -701,7 +699,7 @@ describe("E2E Tests for v0 API Routes", () => {
(doc) =>
doc.metadata &&
doc.metadata.sourceURL &&
doc.metadata.sourceURL.includes("mendable.ai/blog")
doc.metadata.sourceURL.includes("firecrawl.dev/blog")
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);

View File

@ -5,6 +5,8 @@ import { supabase_service } from "../../../src/services/supabase";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
export async function crawlCancelController(req: Request, res: Response) {
try {

View File

@ -6,6 +6,8 @@ import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
export async function getJobs(ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);

View File

@ -244,14 +244,10 @@ export async function scrapeController(req: Request, res: Response) {
}
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
const billingResult = await billTeam(team_id, creditsToBeBilled);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
});
}
billTeam(team_id, creditsToBeBilled).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
}

View File

@ -54,18 +54,10 @@ export async function searchHelper(
if (justSearch) {
const billingResult = await billTeam(
team_id,
res.length
);
if (!billingResult.success) {
return {
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
returnCode: 402,
};
}
billTeam(team_id, res.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
return { success: true, data: res, returnCode: 200 };
}

View File

@ -5,6 +5,8 @@ import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
export async function crawlCancelController(req: Request, res: Response) {
try {

View File

@ -3,6 +3,8 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentCo
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
configDotenv();
export async function getJob(id: string) {
const job = await getScrapeQueue().getJob(id);

View File

@ -18,6 +18,7 @@ import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
import { Logger } from "../../lib/logger";
configDotenv();
@ -61,8 +62,8 @@ export async function mapController(
: `site:${req.body.url}`;
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
// limit to 50 results (beta)
numResults: Math.min(limit, 50),
// limit to 100 results (beta)
numResults: Math.min(limit, 100),
});
if (mapResults.length > 0) {
@ -100,7 +101,10 @@ export async function mapController(
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
await billTeam(req.auth.team_id, 1);
billTeam(req.auth.team_id, 1).catch(error => {
Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
@ -127,5 +131,6 @@ export async function mapController(
return res.status(200).json({
success: true,
links: linksToReturn,
scrape_id: req.body.origin?.includes("website") ? id : undefined,
});
}

View File

@ -106,14 +106,10 @@ export async function scrapeController(
creditsToBeBilled = 50;
}
const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
if (!billingResult.success) {
return res.status(402).json({
success: false,
error:
"Failed to bill team. Insufficient credits or subscription not found.",
});
}
billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && doc.rawHtml) {
@ -147,5 +143,6 @@ export async function scrapeController(
return res.status(200).json({
success: true,
data: legacyDocumentConverter(doc),
scrape_id: origin?.includes("website") ? jobId : undefined,
});
}

View File

@ -225,6 +225,7 @@ export type ScrapeResponse =
success: true;
warning?: string;
data: Document;
scrape_id?: string;
};
export interface ScrapeResponseRequestTest {
@ -246,6 +247,7 @@ export type MapResponse =
| {
success: true;
links: string[];
scrape_id?: string;
};
export type CrawlStatusParams = {

View File

@ -1,3 +1,6 @@
import { configDotenv } from "dotenv";
configDotenv();
enum LogLevel {
NONE = 'NONE', // No logs will be output.
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
@ -25,7 +28,8 @@ export class Logger {
const color = Logger.colors[level];
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
// if (process.env.USE_DB_AUTH) {
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
// if (useDbAuthentication) {
// save to supabase? another place?
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
// }

View File

@ -2,6 +2,8 @@ import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger";
import { configDotenv } from "dotenv";
configDotenv();
export type ScrapeErrorEvent = {
type: "error",
@ -36,7 +38,8 @@ export class ScrapeEvents {
static async insert(jobId: string, content: ScrapeEvent) {
if (jobId === "TEST") return null;
if (process.env.USE_DB_AUTHENTICATION) {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
try {
const result = await supabase.from("scrape_events").insert({
job_id: jobId,

View File

@ -1,5 +1,8 @@
import { AuthResponse } from "../../src/types";
import { Logger } from "./logger";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
let warningCount = 0;
@ -7,7 +10,8 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
originalFunction: (...args: U) => Promise<T>
) {
return async function (...args: U): Promise<T> {
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
if (warningCount < 5) {
Logger.warn("You're bypassing authentication");
warningCount++;
@ -17,6 +21,7 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
try {
return await originalFunction(...args);
} catch (error) {
Sentry.captureException(error);
Logger.error(`Error in withAuth function: ${error}`);
return { success: false, error: error.message } as T;
}

View File

@ -12,6 +12,8 @@ import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv";
configDotenv();
export async function startWebScraperPipeline({
job,
@ -118,15 +120,10 @@ export async function runWebScraper({
: docs;
if(is_scrape === false) {
const billingResult = await billTeam(team_id, filteredDocs.length);
if (!billingResult.success) {
// throw new Error("Failed to bill team, no subscription was found");
return {
success: false,
message: "Failed to bill team, no subscription was found",
docs: [],
};
}
billTeam(team_id, filteredDocs.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here
});
}
@ -144,7 +141,8 @@ export async function runWebScraper({
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
try {
if (process.env.USE_DB_AUTHENTICATION === "true") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
const { data, error } = await supabase_service
.from("firecrawl_jobs")
.update({ docs: result })

View File

@ -33,7 +33,9 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
if (!success) {
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
return res.status(402).json({ success: false, error: "Insufficient credits" });
if (!res.headersSent) {
return res.status(402).json({ success: false, error: "Insufficient credits" });
}
}
req.account = { remainingCredits }
next();
@ -52,7 +54,9 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW
);
if (!success) {
return res.status(status).json({ success: false, error });
if (!res.headersSent) {
return res.status(status).json({ success: false, error });
}
}
req.auth = { team_id, plan };
@ -67,7 +71,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
return res.status(409).json({ success: false, error: "Idempotency key already used" });
if (!res.headersSent) {
return res.status(409).json({ success: false, error: "Idempotency key already used" });
}
}
createIdempotencyKey(req);
}
@ -78,7 +84,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (req.body.url && isUrlBlocked(req.body.url)) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
if (!res.headersSent) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
}
next();
}
@ -96,26 +104,26 @@ export const v1Router = express.Router();
v1Router.post(
"/scrape",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(scrapeController)
);
v1Router.post(
"/crawl",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Crawl),
idempotencyMiddleware,
checkCreditsMiddleware(),
blocklistMiddleware,
idempotencyMiddleware,
wrap(crawlController)
);
v1Router.post(
"/map",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
blocklistMiddleware,
wrap(mapController)
);

View File

@ -23,12 +23,15 @@ import { clientSideError } from "../../strings";
dotenv.config();
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
export const baseScrapers = [
"fire-engine;chrome-cdp",
"fire-engine",
"scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad",
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
useFireEngine ? "fire-engine" : undefined,
useScrapingBee ? "scrapingBee" : undefined,
useFireEngine ? undefined : "playwright",
useScrapingBee ? "scrapingBeeLoad" : undefined,
"fetch",
].filter(Boolean);
@ -85,18 +88,18 @@ function getScrapingFallbackOrder(
});
let defaultOrder = [
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
!process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
"scrapingBee",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
"scrapingBeeLoad",
useFireEngine ? "fire-engine;chrome-cdp" : undefined,
useFireEngine ? "fire-engine" : undefined,
useScrapingBee ? "scrapingBee" : undefined,
useScrapingBee ? "scrapingBeeLoad" : undefined,
useFireEngine ? undefined : "playwright",
"fetch",
].filter(Boolean);
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
defaultOrder = [
"fire-engine",
process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
useFireEngine ? undefined : "playwright",
...defaultOrder.filter(
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
),

View File

@ -5,7 +5,7 @@ import { supabase_service } from "../supabase";
import { Logger } from "../../lib/logger";
import { getValue, setValue } from "../redis";
import { redlock } from "../redlock";
import * as Sentry from "@sentry/node";
const FREE_CREDITS = 500;
@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) {
]);
let couponCredits = 0;
let sortedCoupons = [];
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
(total, coupon) => total + coupon.credits,
0
);
sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits);
}
let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits);
// using coupon credits:
if (couponCredits > 0) {
// if there is no subscription and they have enough coupon credits
@ -175,9 +176,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
}
// Retrieve the team's active subscription and check for available coupons concurrently
const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
await Promise.all([
let cacheKeySubscription = `subscription_${team_id}`;
let cacheKeyCoupons = `coupons_${team_id}`;
// Try to get data from cache first
const [cachedSubscription, cachedCoupons] = await Promise.all([
getValue(cacheKeySubscription),
getValue(cacheKeyCoupons)
]);
let subscription, subscriptionError, coupons;
if (cachedSubscription && cachedCoupons) {
subscription = JSON.parse(cachedSubscription);
coupons = JSON.parse(cachedCoupons);
} else {
// If not in cache, retrieve from database
const [subscriptionResult, couponsResult] = await Promise.all([
supabase_service
.from("subscriptions")
.select("id, price_id, current_period_start, current_period_end")
@ -191,6 +207,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
.eq("status", "active"),
]);
subscription = subscriptionResult.data;
subscriptionError = subscriptionResult.error;
coupons = couponsResult.data;
// Cache the results for a minute, sub can be null and that's fine
await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null
await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute
}
let couponCredits = 0;
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
@ -211,41 +237,54 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
let creditUsages;
let creditUsageError;
let retries = 0;
const maxRetries = 3;
const retryInterval = 2000; // 2 seconds
let totalCreditsUsed = 0;
const cacheKeyCreditUsage = `credit_usage_${team_id}`;
while (retries < maxRetries) {
const result = await supabase_service
.from("credit_usage")
.select("credits_used")
.is("subscription_id", null)
.eq("team_id", team_id);
// Try to get credit usage from cache
const cachedCreditUsage = await getValue(cacheKeyCreditUsage);
creditUsages = result.data;
creditUsageError = result.error;
if (cachedCreditUsage) {
totalCreditsUsed = parseInt(cachedCreditUsage);
} else {
let retries = 0;
const maxRetries = 3;
const retryInterval = 2000; // 2 seconds
if (!creditUsageError) {
break;
while (retries < maxRetries) {
const result = await supabase_service
.from("credit_usage")
.select("credits_used")
.is("subscription_id", null)
.eq("team_id", team_id);
creditUsages = result.data;
creditUsageError = result.error;
if (!creditUsageError) {
break;
}
retries++;
if (retries < maxRetries) {
await new Promise(resolve => setTimeout(resolve, retryInterval));
}
}
retries++;
if (retries < maxRetries) {
await new Promise(resolve => setTimeout(resolve, retryInterval));
if (creditUsageError) {
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}`
);
}
}
if (creditUsageError) {
Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
throw new Error(
`Failed to retrieve credit usage for team_id: ${team_id}`
totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
);
}
const totalCreditsUsed = creditUsages.reduce(
(acc, usage) => acc + usage.credits_used,
0
);
// Cache the result for 30 seconds
await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30);
}
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
@ -255,7 +294,9 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
const creditLimit = FREE_CREDITS;
const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;
if (creditUsagePercentage >= 0.8) {
// Add a check to ensure totalCreditsUsed is greater than 0
if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
await sendNotification(
team_id,
NotificationType.APPROACHING_LIMIT,
@ -309,7 +350,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used;
await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 8 minutes
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
}
}
@ -322,17 +363,38 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
// Adjust total credits used by subtracting coupon value
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
// Get the price details
const { data: price, error: priceError } = await supabase_service
.from("prices")
.select("credits")
.eq("id", subscription.price_id)
.single();
if (priceError) {
throw new Error(
`Failed to retrieve price for price_id: ${subscription.price_id}`
);
// Get the price details from cache or database
const priceCacheKey = `price_${subscription.price_id}`;
let price;
try {
const cachedPrice = await getValue(priceCacheKey);
if (cachedPrice) {
price = JSON.parse(cachedPrice);
} else {
const { data, error: priceError } = await supabase_service
.from("prices")
.select("credits")
.eq("id", subscription.price_id)
.single();
if (priceError) {
throw new Error(
`Failed to retrieve price for price_id: ${subscription.price_id}`
);
}
price = data;
// There are only 21 records, so this is super fine
// Cache the price for a long time (e.g., 1 day)
await setValue(priceCacheKey, JSON.stringify(price), 86400);
}
} catch (error) {
Logger.error(`Error retrieving or caching price: ${error}`);
Sentry.captureException(error);
// If errors, just assume it's a big number so user don't get an error
price = { credits: 1000000 };
}
const creditLimit = price.credits;
@ -462,8 +524,8 @@ async function createCreditUsage({
subscription_id?: string;
credits: number;
}) {
const { data: credit_usage } = await supabase_service
.from("credit_usage")
await supabase_service
.from("credit_usage")
.insert([
{
team_id,
@ -471,8 +533,7 @@ async function createCreditUsage({
subscription_id: subscription_id || null,
created_at: new Date(),
},
])
.select();
]);
return { success: true, credit_usage };
return { success: true };
}

View File

@ -1,9 +1,11 @@
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
import "dotenv/config";
import { configDotenv } from "dotenv";
configDotenv();
export async function logCrawl(job_id: string, team_id: string) {
if (process.env.USE_DB_AUTHENTICATION === 'true') {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) {
try {
const { data, error } = await supabase_service
.from("bulljobs_teams")

View File

@ -4,10 +4,13 @@ import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog";
import "dotenv/config";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();
export async function logJob(job: FirecrawlJob) {
try {
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
return;
}

View File

@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types";
import { supabase_service } from "../supabase";
import { PageOptions } from "../../lib/entities";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();
export async function logScrape(
scrapeLog: ScrapeLog,
pageOptions?: PageOptions
) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
Logger.debug("Skipping logging scrape to Supabase");
return;
}

View File

@ -67,6 +67,6 @@ export function waitForJob(jobId: string, timeout: number) {
reject((await getScrapeQueue().getJob(jobId)).failedReason);
}
}
}, 1000);
}, 500);
})
}

View File

@ -36,6 +36,8 @@ import {
} from "../../src/lib/job-priority";
import { PlanType } from "../types";
import { getJobs } from "../../src/controllers/v1/crawl-status";
import { configDotenv } from "dotenv";
configDotenv();
if (process.env.ENV === "production") {
initSDK({

View File

@ -1,5 +1,7 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { Logger } from "../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@ -8,8 +10,9 @@ class SupabaseService {
constructor() {
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
// Only initialize the Supabase client if both URL and Service Token are provided.
if (process.env.USE_DB_AUTHENTICATION === "false") {
if (!useDbAuthentication) {
// Warn the user that Authentication is disabled by setting the client to null
Logger.warn(
"Authentication is disabled. Supabase client will not be initialized."

View File

@ -3,6 +3,8 @@ import { legacyDocumentConverter } from "../../src/controllers/v1/types";
import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";
import { WebhookEventType } from "../types";
import { configDotenv } from "dotenv";
configDotenv();
export const callWebhook = async (
teamId: string,

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.2.1",
"version": "1.2.2",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/cjs/index.js",
"types": "types/index.d.ts",

View File

@ -454,20 +454,27 @@ export default class FirecrawlApp {
checkInterval: number
): Promise<CrawlStatusResponse> {
while (true) {
const statusResponse: AxiosResponse = await this.getRequest(
let statusResponse: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/crawl/${id}`,
headers
);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
let statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusResponse = await this.getRequest(statusData.next, headers);
statusData = statusResponse.data;
data = data.concat(statusData.data);
}
statusData.data = data;
return statusData;
} else {
throw new Error("Crawl job completed but no data was returned");
}
} else if (
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
) {
checkInterval = Math.max(checkInterval, 2);
await new Promise((resolve) =>

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp
__version__ = "1.2.1"
__version__ = "1.2.3"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -238,7 +238,6 @@ class FirecrawlApp:
)
if response.status_code == 200:
response = response.json()
print(response)
if response['success'] and 'links' in response:
return response['links']
else:
@ -346,6 +345,12 @@ class FirecrawlApp:
status_data = status_response.json()
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
status_response = self._get_request(status_data['next'], headers)
status_data = status_response.json()
data.extend(status_data['data'])
status_data['data'] = data
return status_data
else:
raise Exception('Crawl job completed but no data was returned')

View File

@ -1,5 +1,6 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import "dotenv/config";
import { configDotenv } from "dotenv";
configDotenv();
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@ -9,7 +10,8 @@ class SupabaseService {
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
// Only initialize the Supabase client if both URL and Service Token are provided.
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
// Warn the user that Authentication is disabled by setting the client to null
console.warn(
"Authentication is disabled. Supabase client will not be initialized."
@ -36,7 +38,8 @@ export const supabase_service: SupabaseClient = new Proxy(
new SupabaseService(),
{
get: function (target, prop, receiver) {
if (process.env.USE_DB_AUTHENTICATION === "false") {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) {
console.debug(
"Attempted to access Supabase client when it's not configured."
);