Merge branch 'v1-webscraper' of https://github.com/mendableai/firecrawl into v1-webscraper

This commit is contained in:
rafaelsideguide 2024-08-26 19:22:05 -03:00
commit adc3e4233d
16 changed files with 43 additions and 244 deletions

1
.gitignore vendored
View File

@ -20,3 +20,4 @@ apps/playwright-service-ts/node_modules/
apps/playwright-service-ts/package-lock.json apps/playwright-service-ts/package-lock.json
*.pyc *.pyc
.rdb

View File

@ -1,6 +1,6 @@
import { crawlController } from '../v0/crawl' import { crawlController } from '../v0/crawl'
import { Request, Response } from 'express'; import { Request, Response } from 'express';
import { authenticateUser } from '../v0/auth'; // Ensure this import is correct import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create'; import { createIdempotencyKey } from '../../services/idempotency/create';
import { validateIdempotencyKey } from '../../services/idempotency/validate'; import { validateIdempotencyKey } from '../../services/idempotency/validate';
import { v4 as uuidv4 } from 'uuid'; import { v4 as uuidv4 } from 'uuid';

View File

@ -1,19 +1,19 @@
import { parseApi } from "../../../src/lib/parseApi"; import { parseApi } from "../lib/parseApi";
import { getRateLimiter } from "../../../src/services/rate-limiter"; import { getRateLimiter } from "../services/rate-limiter";
import { import {
AuthResponse, AuthResponse,
NotificationType, NotificationType,
RateLimiterMode, RateLimiterMode,
} from "../../../src/types"; } from "../types";
import { supabase_service } from "../../../src/services/supabase"; import { supabase_service } from "../services/supabase";
import { withAuth } from "../../../src/lib/withAuth"; import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible"; import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from "@hyperdx/node-opentelemetry"; import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../../services/notification/email_notification"; import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../../lib/logger"; import { Logger } from "../lib/logger";
import { redlock } from "../../../src/services/redlock"; import { redlock } from "../services/redlock";
import { getValue } from "../../../src/services/redis"; import { getValue } from "../services/redis";
import { setValue } from "../../../src/services/redis"; import { setValue } from "../services/redis";
import { validate } from "uuid"; import { validate } from "uuid";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";

View File

@ -1,5 +1,5 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { authenticateUser } from "./auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase"; import { supabase_service } from "../../../src/services/supabase";
import { Logger } from "../../../src/lib/logger"; import { Logger } from "../../../src/lib/logger";

View File

@ -1,5 +1,5 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { authenticateUser } from "./auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service"; import { getScrapeQueue } from "../../../src/services/queue-service";
import { Logger } from "../../../src/lib/logger"; import { Logger } from "../../../src/lib/logger";

View File

@ -1,6 +1,6 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing"; import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { addScrapeJob } from "../../../src/services/queue-jobs"; import { addScrapeJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";

View File

@ -1,5 +1,5 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { authenticateUser } from "./auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";

View File

@ -2,7 +2,7 @@
import { AuthResponse, RateLimiterMode } from "../../types"; import { AuthResponse, RateLimiterMode } from "../../types";
import { Request, Response } from "express"; import { Request, Response } from "express";
import { authenticateUser } from "./auth"; import { authenticateUser } from "../auth";
export const keyAuthController = async (req: Request, res: Response) => { export const keyAuthController = async (req: Request, res: Response) => {

View File

@ -1,7 +1,7 @@
import { ExtractorOptions, PageOptions } from './../../lib/entities'; import { ExtractorOptions, PageOptions } from './../../lib/entities';
import { Request, Response } from "express"; import { Request, Response } from "express";
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types"; import { RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { Document } from "../../lib/entities"; import { Document } from "../../lib/entities";

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../scraper/WebScraper"; import { WebScraperDataProvider } from "../../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "./auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types"; import { RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { PageOptions, SearchOptions } from "../../lib/entities"; import { PageOptions, SearchOptions } from "../../lib/entities";

View File

@ -1,222 +0,0 @@
import { parseApi } from "../../../src/lib/parseApi";
import { getRateLimiter, } from "../../../src/services/rate-limiter";
import { AuthResponse, NotificationType, RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase";
import { withAuth } from "../../../src/lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
import { sendNotification } from "../../../src/services/notification/email_notification";
import { Logger } from "../../../src/lib/logger";
/**
 * Entry point used by controllers to authenticate an incoming request.
 *
 * Wraps `supaAuthenticateUser` with `withAuth` and forwards the arguments
 * unchanged to the wrapped function.
 *
 * @param req  Incoming request, passed through untouched.
 * @param res  Outgoing response, passed through untouched.
 * @param mode Optional rate limiter mode, passed through untouched.
 * @returns Whatever the wrapped authenticator resolves to.
 */
export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
  // NOTE(review): presumably withAuth short-circuits when auth is disabled — confirm in lib/withAuth.
  const guardedAuthenticate = withAuth(supaAuthenticateUser);
  return guardedAuthenticate(req, res, mode);
}
/**
 * Attaches the team id and API key to the current trace via
 * `setTraceAttributes`.
 *
 * Any failure while setting the attributes is logged through `Logger.error`
 * and swallowed, so tracing problems never fail the calling request.
 *
 * @param team_id Team identifier to record on the trace.
 * @param api_key API key to record on the trace.
 */
function setTrace(team_id: string, api_key: string) {
  try {
    setTraceAttributes({ team_id, api_key });
  } catch (error) {
    // A thrown value is not guaranteed to be an Error; narrow before reading
    // `.message` so a non-Error throw is not logged as "undefined".
    const detail = error instanceof Error ? error.message : String(error);
    Logger.error(`Error setting trace attributes: ${detail}`);
  }
}
/**
 * Authenticates a request from its `Authorization` header and applies the
 * rate limiter that corresponds to `mode`.
 *
 * Flow:
 *  1. Return 401 when the header, or the token after the first space, is missing.
 *  2. Preview token: use the Preview rate limiter, keyed by client IP + token.
 *  3. Any other token: resolve it through the `get_key_and_price_id_2` RPC,
 *     derive the plan from the returned price id and select the rate limiter
 *     for `mode`, keyed by the team id.
 *  4. Consume one point from the limiter; a rejection yields a 429 response.
 *  5. The preview token is accepted only for Scrape, Preview, Search and Map;
 *     for any other mode it falls through to a lookup in the `api_keys` table.
 *
 * @param req  Request object; reads `headers.authorization`,
 *             `headers["x-forwarded-for"]` and `socket.remoteAddress`.
 * @param res  Response object; not used by this function.
 * @param mode Rate limiter mode of the calling endpoint. When omitted or
 *             unrecognised, the Crawl limiter without a plan is used.
 * @returns On success `{ success: true, team_id, plan, api_key }` (or
 *          `{ success: true, team_id: "preview" }` for the preview token);
 *          on failure `{ success: false, error, status }` with status 401 or 429.
 */
export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode
): Promise<AuthResponse> {
  const PREVIEW_TOKEN = "this_is_just_a_preview_token";

  const authHeader = req.headers.authorization;
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
  }

  const token = authHeader.split(" ")[1]; // Extract the token from "Bearer <token>"
  if (!token) {
    return {
      success: false,
      error: "Unauthorized: Token missing",
      status: 401,
    };
  }
  const isPreviewToken = token === PREVIEW_TOKEN;

  const incomingIP = (req.headers["x-forwarded-for"] ||
    req.socket.remoteAddress) as string;
  const iptoken = incomingIP + token;

  let rateLimiter: RateLimiterRedis;
  let subscriptionData: { team_id: string, plan: string } | null = null;
  let normalizedApi: string;
  let team_id: string;

  if (isPreviewToken) {
    rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
    team_id = "preview";
  } else {
    normalizedApi = parseApi(token);

    const { data, error } = await supabase_service.rpc(
      'get_key_and_price_id_2', { api_key: normalizedApi }
    );
    // get_key_and_price_id_2 rpc definition:
    // create or replace function get_key_and_price_id_2(api_key uuid)
    //   returns table(key uuid, team_id uuid, price_id text) as $$
    //   begin
    //     if api_key is null then
    //       return query
    //       select null::uuid as key, null::uuid as team_id, null::text as price_id;
    //     end if;
    //     return query
    //     select ak.key, ak.team_id, s.price_id
    //     from api_keys ak
    //     left join subscriptions s on ak.team_id = s.team_id
    //     where ak.key = api_key;
    //   end;
    //   $$ language plpgsql;

    if (error) {
      Logger.warn(`Error fetching key and price_id: ${error.message}`);
    }

    if (error || !data || data.length === 0) {
      // `error` is null when the query succeeded but matched no key, so it must
      // not be dereferenced unconditionally (that used to throw a TypeError).
      Logger.warn(`Error fetching api key: ${error ? error.message : "no error returned"} or data is empty`);
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }

    team_id = data[0].team_id;
    const plan = getPlanByPriceId(data[0].price_id);

    // HyperDX Logging
    setTrace(team_id, normalizedApi);

    subscriptionData = {
      team_id: team_id,
      plan: plan,
    };

    switch (mode) {
      // Plan-aware limiters.
      case RateLimiterMode.Crawl:
      case RateLimiterMode.Scrape:
      case RateLimiterMode.Search:
        rateLimiter = getRateLimiter(mode, token, subscriptionData.plan);
        break;
      // Limiters that are selected without a plan.
      case RateLimiterMode.CrawlStatus:
      case RateLimiterMode.Map:
      case RateLimiterMode.Preview:
        rateLimiter = getRateLimiter(mode, token);
        break;
      default:
        rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token);
        break;
    }
  }

  // Preview traffic is limited per client IP + token, everything else per team.
  const team_endpoint_token = isPreviewToken ? iptoken : team_id;

  try {
    await rateLimiter.consume(team_endpoint_token);
  } catch (rateLimiterRes) {
    Logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);

    // Rate limit notification e-mails are currently disabled. When re-enabled,
    // they may only be sent once every 7 days (sendNotification checks the
    // date window itself):
    // await sendNotification(team_id, NotificationType.RATE_LIMIT_REACHED, startDate.toISOString(), endDate.toISOString());

    return {
      success: false,
      error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Upgrade your plan at https://firecrawl.dev/pricing for increased rate limits or please retry after ${secs}s, resets at ${retryDate}`,
      status: 429,
    };
  }

  if (
    isPreviewToken &&
    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map)
  ) {
    // Disabled origin check, kept for reference:
    // check the origin of the request and make sure its from firecrawl.dev
    // const origin = req.headers.origin;
    // if (origin && origin.includes("firecrawl.dev")){
    //   return { success: true, team_id: "preview" };
    // }
    // if(process.env.ENV !== "production") {
    //   return { success: true, team_id: "preview" };
    // }
    // return { success: false, error: "Unauthorized: Invalid token", status: 401 };
    return { success: true, team_id: "preview" };
  }

  // make sure api key is valid, based on the api_keys table in supabase
  if (!subscriptionData) {
    normalizedApi = parseApi(token);

    const { data, error } = await supabase_service
      .from("api_keys")
      .select("*")
      .eq("key", normalizedApi);

    if (error || !data || data.length === 0) {
      // Same guard as above: `error` may be null when no row matched.
      Logger.warn(`Error fetching api key: ${error ? error.message : "no error returned"} or data is empty`);
      return {
        success: false,
        error: "Unauthorized: Invalid token",
        status: 401,
      };
    }

    subscriptionData = data[0];
  }

  return {
    success: true,
    team_id: subscriptionData.team_id,
    plan: subscriptionData.plan ?? "",
    api_key: normalizedApi
  };
}
/**
 * Maps a Stripe price id to the internal plan name by comparing it against the
 * `STRIPE_PRICE_ID_*` environment variables.
 *
 * @param price_id Price id to look up; may be missing when there is no
 *                 subscription.
 * @returns 'starter', 'standard', 'scale', 'hobby', 'standardnew' or 'growth'
 *          for a matching price id, otherwise 'free'.
 */
function getPlanByPriceId(price_id: string | null | undefined) {
  // `switch` compares with ===, so without this guard an undefined (or empty)
  // price id would match the first unset (or empty) environment variable and
  // be reported as a paid plan.
  if (!price_id) {
    return 'free';
  }

  switch (price_id) {
    case process.env.STRIPE_PRICE_ID_STARTER:
      return 'starter';
    case process.env.STRIPE_PRICE_ID_STANDARD:
      return 'standard';
    case process.env.STRIPE_PRICE_ID_SCALE:
      return 'scale';
    case process.env.STRIPE_PRICE_ID_HOBBY:
    case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY:
      return 'hobby';
    case process.env.STRIPE_PRICE_ID_STANDARD_NEW:
    case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY:
      return 'standardnew';
    case process.env.STRIPE_PRICE_ID_GROWTH:
    case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
      return 'growth';
    default:
      return 'free';
  }
}

View File

@ -1,6 +1,6 @@
import { authMiddleware } from "../../routes/v1"; import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types"; import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../v0/auth"; import { authenticateUser } from "../auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
import { WebSocket } from "ws"; import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";

View File

@ -34,10 +34,30 @@ export async function crawlController(
const { remainingCredits } = req.account; const { remainingCredits } = req.account;
// TODO: Get rid of crawlerOptions
const crawlerOptions = legacyCrawlerOptions(req.body); const crawlerOptions = legacyCrawlerOptions(req.body);
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
// TODO: @rafa, is this right? copied from v0
if (Array.isArray(crawlerOptions.includes)) {
for (const x of crawlerOptions.includes) {
try {
new RegExp(x);
} catch (e) {
return res.status(400).json({ success: false, error: e.message });
}
}
}
if (Array.isArray(crawlerOptions.excludes)) {
for (const x of crawlerOptions.excludes) {
try {
new RegExp(x);
} catch (e) {
return res.status(400).json({ success: false, error: e.message });
}
}
}
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
const sc: StoredCrawl = { const sc: StoredCrawl = {

View File

@ -97,7 +97,7 @@ export async function mapController(
logJob({ logJob({
job_id: id, job_id: id,
success: true, success: links.length > 0,
message: "Map completed", message: "Map completed",
num_docs: 1, num_docs: 1,
docs: links, docs: links,

View File

@ -6,7 +6,7 @@ import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
import { mapController } from "../../src/controllers/v1/map"; import { mapController } from "../../src/controllers/v1/map";
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types"; import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
import { RateLimiterMode } from "../types"; import { RateLimiterMode } from "../types";
import { authenticateUser } from "../controllers/v1/auth"; import { authenticateUser } from "../controllers/auth";
import { Logger } from "../lib/logger"; import { Logger } from "../lib/logger";
import { createIdempotencyKey } from "../services/idempotency/create"; import { createIdempotencyKey } from "../services/idempotency/create";
import { validateIdempotencyKey } from "../services/idempotency/validate"; import { validateIdempotencyKey } from "../services/idempotency/validate";

Binary file not shown.