mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 11:42:24 +08:00
Merge branch 'v1-webscraper' of https://github.com/mendableai/firecrawl into v1-webscraper
This commit is contained in:
commit
adc3e4233d
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -20,3 +20,4 @@ apps/playwright-service-ts/node_modules/
|
|||
apps/playwright-service-ts/package-lock.json
|
||||
|
||||
*.pyc
|
||||
.rdb
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { crawlController } from '../v0/crawl'
|
||||
import { Request, Response } from 'express';
|
||||
import { authenticateUser } from '../v0/auth'; // Ensure this import is correct
|
||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
||||
import { validateIdempotencyKey } from '../../services/idempotency/validate';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
|
|
@ -1,19 +1,19 @@
|
|||
import { parseApi } from "../../../src/lib/parseApi";
|
||||
import { getRateLimiter } from "../../../src/services/rate-limiter";
|
||||
import { parseApi } from "../lib/parseApi";
|
||||
import { getRateLimiter } from "../services/rate-limiter";
|
||||
import {
|
||||
AuthResponse,
|
||||
NotificationType,
|
||||
RateLimiterMode,
|
||||
} from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { withAuth } from "../../../src/lib/withAuth";
|
||||
} from "../types";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { withAuth } from "../lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
||||
import { sendNotification } from "../../services/notification/email_notification";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { redlock } from "../../../src/services/redlock";
|
||||
import { getValue } from "../../../src/services/redis";
|
||||
import { setValue } from "../../../src/services/redis";
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { redlock } from "../services/redlock";
|
||||
import { getValue } from "../services/redis";
|
||||
import { setValue } from "../services/redis";
|
||||
import { validate } from "uuid";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import { Request, Response } from "express";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
import { AuthResponse, RateLimiterMode } from "../../types";
|
||||
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
|
||||
|
||||
export const keyAuthController = async (req: Request, res: Response) => {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import { ExtractorOptions, PageOptions } from './../../lib/entities';
|
||||
import { Request, Response } from "express";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { Document } from "../../lib/entities";
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../../lib/entities";
|
||||
|
|
|
@ -1,222 +0,0 @@
|
|||
import { parseApi } from "../../../src/lib/parseApi";
|
||||
import { getRateLimiter, } from "../../../src/services/rate-limiter";
|
||||
import { AuthResponse, NotificationType, RateLimiterMode } from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { withAuth } from "../../../src/lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
|
||||
import { sendNotification } from "../../../src/services/notification/email_notification";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
|
||||
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
|
||||
return withAuth(supaAuthenticateUser)(req, res, mode);
|
||||
}
|
||||
function setTrace(team_id: string, api_key: string) {
|
||||
try {
|
||||
setTraceAttributes({
|
||||
team_id,
|
||||
api_key
|
||||
});
|
||||
} catch (error) {
|
||||
Logger.error(`Error setting trace attributes: ${error.message}`);
|
||||
}
|
||||
|
||||
}
|
||||
export async function supaAuthenticateUser(
|
||||
req,
|
||||
res,
|
||||
mode?: RateLimiterMode
|
||||
): Promise<AuthResponse> {
|
||||
const authHeader = req.headers.authorization;
|
||||
if (!authHeader) {
|
||||
return { success: false, error: "Unauthorized", status: 401 };
|
||||
}
|
||||
const token = authHeader.split(" ")[1]; // Extract the token from "Bearer <token>"
|
||||
if (!token) {
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Token missing",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
|
||||
const incomingIP = (req.headers["x-forwarded-for"] ||
|
||||
req.socket.remoteAddress) as string;
|
||||
const iptoken = incomingIP + token;
|
||||
|
||||
let rateLimiter: RateLimiterRedis;
|
||||
let subscriptionData: { team_id: string, plan: string } | null = null;
|
||||
let normalizedApi: string;
|
||||
|
||||
let team_id: string;
|
||||
|
||||
if (token == "this_is_just_a_preview_token") {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
team_id = "preview";
|
||||
} else {
|
||||
normalizedApi = parseApi(token);
|
||||
|
||||
const { data, error } = await supabase_service.rpc(
|
||||
'get_key_and_price_id_2', { api_key: normalizedApi }
|
||||
);
|
||||
// get_key_and_price_id_2 rpc definition:
|
||||
// create or replace function get_key_and_price_id_2(api_key uuid)
|
||||
// returns table(key uuid, team_id uuid, price_id text) as $$
|
||||
// begin
|
||||
// if api_key is null then
|
||||
// return query
|
||||
// select null::uuid as key, null::uuid as team_id, null::text as price_id;
|
||||
// end if;
|
||||
|
||||
// return query
|
||||
// select ak.key, ak.team_id, s.price_id
|
||||
// from api_keys ak
|
||||
// left join subscriptions s on ak.team_id = s.team_id
|
||||
// where ak.key = api_key;
|
||||
// end;
|
||||
// $$ language plpgsql;
|
||||
|
||||
if (error) {
|
||||
Logger.warn(`Error fetching key and price_id: ${error.message}`);
|
||||
} else {
|
||||
// console.log('Key and Price ID:', data);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
const internal_team_id = data[0].team_id;
|
||||
team_id = internal_team_id;
|
||||
|
||||
const plan = getPlanByPriceId(data[0].price_id);
|
||||
// HyperDX Logging
|
||||
setTrace(team_id, normalizedApi);
|
||||
subscriptionData = {
|
||||
team_id: team_id,
|
||||
plan: plan,
|
||||
}
|
||||
switch (mode) {
|
||||
case RateLimiterMode.Crawl:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
|
||||
break;
|
||||
case RateLimiterMode.Scrape:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Scrape, token, subscriptionData.plan);
|
||||
break;
|
||||
case RateLimiterMode.Search:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Search, token, subscriptionData.plan);
|
||||
break;
|
||||
case RateLimiterMode.CrawlStatus:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
break;
|
||||
case RateLimiterMode.Map:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Map, token);
|
||||
break;
|
||||
|
||||
case RateLimiterMode.Preview:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
break;
|
||||
default:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
|
||||
break;
|
||||
// case RateLimiterMode.Search:
|
||||
// rateLimiter = await searchRateLimiter(RateLimiterMode.Search, token);
|
||||
// break;
|
||||
}
|
||||
}
|
||||
|
||||
const team_endpoint_token = token === "this_is_just_a_preview_token" ? iptoken : team_id;
|
||||
|
||||
try {
|
||||
await rateLimiter.consume(team_endpoint_token);
|
||||
} catch (rateLimiterRes) {
|
||||
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
|
||||
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
|
||||
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
|
||||
|
||||
// We can only send a rate limit email every 7 days, send notification already has the date in between checking
|
||||
const startDate = new Date();
|
||||
const endDate = new Date();
|
||||
endDate.setDate(endDate.getDate() + 7);
|
||||
// await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());
|
||||
return {
|
||||
success: false,
|
||||
error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
|
||||
status: 429,
|
||||
};
|
||||
}
|
||||
|
||||
if (
|
||||
token === "this_is_just_a_preview_token" &&
|
||||
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
// check the origin of the request and make sure its from firecrawl.dev
|
||||
// const origin = req.headers.origin;
|
||||
// if (origin && origin.includes("firecrawl.dev")){
|
||||
// return { success: true, team_id: "preview" };
|
||||
// }
|
||||
// if(process.env.ENV !== "production") {
|
||||
// return { success: true, team_id: "preview" };
|
||||
// }
|
||||
|
||||
// return { success: false, error: "Unauthorized: Invalid token", status: 401 };
|
||||
}
|
||||
|
||||
// make sure api key is valid, based on the api_keys table in supabase
|
||||
if (!subscriptionData) {
|
||||
normalizedApi = parseApi(token);
|
||||
|
||||
const { data, error } = await supabase_service
|
||||
.from("api_keys")
|
||||
.select("*")
|
||||
.eq("key", normalizedApi);
|
||||
|
||||
|
||||
|
||||
if (error || !data || data.length === 0) {
|
||||
Logger.warn(`Error fetching api key: ${error.message} or data is empty`);
|
||||
return {
|
||||
success: false,
|
||||
error: "Unauthorized: Invalid token",
|
||||
status: 401,
|
||||
};
|
||||
}
|
||||
|
||||
subscriptionData = data[0];
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
team_id: subscriptionData.team_id,
|
||||
plan: subscriptionData.plan ?? "",
|
||||
api_key: normalizedApi
|
||||
};
|
||||
}
|
||||
function getPlanByPriceId(price_id: string) {
|
||||
switch (price_id) {
|
||||
case process.env.STRIPE_PRICE_ID_STARTER:
|
||||
return 'starter';
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD:
|
||||
return 'standard';
|
||||
case process.env.STRIPE_PRICE_ID_SCALE:
|
||||
return 'scale';
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY:
|
||||
case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
|
||||
return 'hobby';
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
|
||||
case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
|
||||
return 'standardnew';
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH:
|
||||
case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
|
||||
return 'growth';
|
||||
default:
|
||||
return 'free';
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
import { authMiddleware } from "../../routes/v1";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { authenticateUser } from "../v0/auth";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { WebSocket } from "ws";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
|
|
|
@ -34,10 +34,30 @@ export async function crawlController(
|
|||
|
||||
const { remainingCredits } = req.account;
|
||||
|
||||
// TODO: Get rid of crawlerOptions
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
||||
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
||||
|
||||
// TODO: @rafa, is this right? copied from v0
|
||||
if (Array.isArray(crawlerOptions.includes)) {
|
||||
for (const x of crawlerOptions.includes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(crawlerOptions.excludes)) {
|
||||
for (const x of crawlerOptions.excludes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
|
|
|
@ -97,7 +97,7 @@ export async function mapController(
|
|||
|
||||
logJob({
|
||||
job_id: id,
|
||||
success: true,
|
||||
success: links.length > 0,
|
||||
message: "Map completed",
|
||||
num_docs: 1,
|
||||
docs: links,
|
||||
|
|
|
@ -6,7 +6,7 @@ import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
|
|||
import { mapController } from "../../src/controllers/v1/map";
|
||||
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { authenticateUser } from "../controllers/v1/auth";
|
||||
import { authenticateUser } from "../controllers/auth";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { createIdempotencyKey } from "../services/idempotency/create";
|
||||
import { validateIdempotencyKey } from "../services/idempotency/validate";
|
||||
|
|
BIN
temp-37564.rdb
BIN
temp-37564.rdb
Binary file not shown.
Loading…
Reference in New Issue
Block a user