WebScraper refactor into scrapeURL (#714)

* feat: use strictNullChecking

* feat: switch logger to Winston

* feat(scrapeURL): first batch

* fix(scrapeURL): error swallow

* fix(scrapeURL): add timeout to EngineResultsTracker

* fix(scrapeURL): report unexpected error to sentry

* chore: remove unused modules

* feat(transformers/coerce): warn when a format's response is missing

* feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support

* (add note)

* feat(scrapeURL): wip readme

* feat(scrapeURL): LLM extract

* feat(scrapeURL): better warnings

* fix(scrapeURL/engines/fire-engine;playwright): fix screenshot

* feat(scrapeURL): add forceEngine internal option

* feat(scrapeURL/engines): scrapingbee

* feat(scrapeURL/transformers): uploadScreenshot

* feat(scrapeURL): more intense tests

* bunch of stuff

* get rid of WebScraper (mostly)

* adapt batch scrape

* add staging deploy workflow

* fix yaml

* fix logger issues

* fix v1 test schema

* feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions

* scrapeURL: v0 backwards compat

* logger fixes

* feat(scrapeurl): v0 returnOnlyUrls support

* fix(scrapeURL/v0): URL leniency

* fix(batch-scrape): ts non-nullable

* fix(scrapeURL/fire-engine/chromecdp): fix wait action

* fix(logger): remove error debug key

* feat(requests.http): use dotenv expression

* fix(scrapeURL/extractMetadata): extract custom metadata

* fix crawl option conversion

* feat(scrapeURL): Add retry logic to robustFetch

* fix(scrapeURL): crawl stuff

* fix(scrapeURL): LLM extract

* fix(scrapeURL/v0): search fix

* fix(tests/v0): grant larger response size to v0 crawl status

* feat(scrapeURL): basic fetch engine

* feat(scrapeURL): playwright engine

* feat(scrapeURL): add url-specific parameters

* Update readme and examples

* added e2e tests for most parameters. Still a few actions, location and iframes to be done.

* fixed type

* Nick:

* Update scrape.ts

* Update index.ts

* added actions and base64 check

* Nick: skipTls feature flag?

* 403

* todo

* todo

* fixes

* yeet headers from url specific params

* add warning when final engine has feature deficit

* expose engine results tracker for ScrapeEvents implementation

* ingest scrape events

* fixed some tests

* comment

* Update index.test.ts

* fixed rawHtml

* Update index.test.ts

* update comments

* move geolocation to global f-e option, fix removeBase64Images

* Nick:

* trim url-specific params

* Update index.ts

---------

Co-authored-by: Eric Ciarla <ericciarla@yahoo.com>
Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
Gergő Móricz 2024-11-07 20:57:33 +01:00 committed by GitHub
parent ed5a0d3cf2
commit 8d467c8ca7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
142 changed files with 4230 additions and 6334 deletions

View File

@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }} HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1 HDX_NODE_BETA_MODE: 1
jobs: jobs:

View File

@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }} HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1 HDX_NODE_BETA_MODE: 1
jobs: jobs:

View File

@ -8,7 +8,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }} HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -21,7 +20,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1 HDX_NODE_BETA_MODE: 1

View File

@ -12,7 +12,6 @@ env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
HOST: ${{ secrets.HOST }} HOST: ${{ secrets.HOST }}
LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
@ -25,7 +24,6 @@ env:
SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
TEST_API_KEY: ${{ secrets.TEST_API_KEY }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1 HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }} FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}

View File

@ -0,0 +1,32 @@
# Builds the API Docker image from ./apps/api and pushes it to GHCR as the
# staging image. Runs on pushes to the refactor branch or on manual dispatch.
name: STAGING Deploy Images to GHCR

on:
  push:
    branches:
      - mog/webscraper-refactor
  workflow_dispatch:

jobs:
  push-app-image:
    runs-on: ubuntu-latest
    # GITHUB_TOKEN must be allowed to write packages in order to push to ghcr.io.
    permissions:
      contents: read
      packages: write
    defaults:
      run:
        working-directory: './apps/api'
    steps:
      # Pin to a released major version instead of the moving `main` branch.
      - name: 'Checkout GitHub Action'
        uses: actions/checkout@v4

      - name: 'Login to GitHub Container Registry'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{github.actor}}
          password: ${{secrets.GITHUB_TOKEN}}

      - name: 'Build and Push Staging Image'
        run: |
          docker build . --tag ghcr.io/mendableai/firecrawl-staging:latest
          docker push ghcr.io/mendableai/firecrawl-staging:latest

View File

@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= #Set if you'd like to use ScrapingBee to handle JS blocking SCRAPING_BEE_API_KEY= #Set if you'd like to use ScrapingBee to handle JS blocking
OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.) OPENAI_API_KEY= # add for LLM dependent features (image alt generation, etc.)
BULL_AUTH_KEY= @ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

View File

@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real
SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper
OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation) OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation)
BULL_AUTH_KEY= @ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages

View File

@ -33,8 +33,6 @@ SCRAPING_BEE_API_KEY=
# add for LLM dependent features (image alt generation, etc.) # add for LLM dependent features (image alt generation, etc.)
OPENAI_API_KEY= OPENAI_API_KEY=
BULL_AUTH_KEY=@ BULL_AUTH_KEY=@
# use if you're configuring basic logging with logtail
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs # set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY= LLAMAPARSE_API_KEY=
# set if you'd like to send slack server health status messages # set if you'd like to send slack server health status messages
@ -54,9 +52,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
STRIPE_PRICE_ID_GROWTH= STRIPE_PRICE_ID_GROWTH=
STRIPE_PRICE_ID_GROWTH_YEARLY= STRIPE_PRICE_ID_GROWTH_YEARLY=
HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1
# set if you'd like to use the fire engine closed beta # set if you'd like to use the fire engine closed beta
FIRE_ENGINE_BETA_URL= FIRE_ENGINE_BETA_URL=

View File

@ -1 +1 @@
global.fetch = require('jest-fetch-mock'); // global.fetch = require('jest-fetch-mock');

View File

@ -32,9 +32,11 @@
"@tsconfig/recommended": "^1.0.3", "@tsconfig/recommended": "^1.0.3",
"@types/body-parser": "^1.19.2", "@types/body-parser": "^1.19.2",
"@types/cors": "^2.8.13", "@types/cors": "^2.8.13",
"@types/escape-html": "^1.0.4",
"@types/express": "^4.17.17", "@types/express": "^4.17.17",
"@types/jest": "^29.5.12", "@types/jest": "^29.5.12",
"@types/node": "^20.14.1", "@types/node": "^20.14.1",
"@types/pdf-parse": "^1.1.4",
"body-parser": "^1.20.1", "body-parser": "^1.20.1",
"express": "^4.18.2", "express": "^4.18.2",
"jest": "^29.6.3", "jest": "^29.6.3",
@ -53,9 +55,7 @@
"@bull-board/api": "^5.20.5", "@bull-board/api": "^5.20.5",
"@bull-board/express": "^5.20.5", "@bull-board/express": "^5.20.5",
"@devil7softwares/pos": "^1.0.2", "@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.13", "@dqbd/tiktoken": "^1.0.16",
"@hyperdx/node-opentelemetry": "^0.8.1",
"@logtail/node": "^0.4.12",
"@nangohq/node": "^0.40.8", "@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1", "@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0", "@sentry/node": "^8.26.0",
@ -78,6 +78,7 @@
"date-fns": "^3.6.0", "date-fns": "^3.6.0",
"dotenv": "^16.3.1", "dotenv": "^16.3.1",
"dotenv-cli": "^7.4.2", "dotenv-cli": "^7.4.2",
"escape-html": "^1.0.3",
"express-rate-limit": "^7.3.1", "express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2", "express-ws": "^5.0.2",
"form-data": "^4.0.0", "form-data": "^4.0.0",
@ -92,6 +93,7 @@
"languagedetect": "^2.0.0", "languagedetect": "^2.0.0",
"logsnag": "^1.0.0", "logsnag": "^1.0.0",
"luxon": "^3.4.3", "luxon": "^3.4.3",
"marked": "^14.1.2",
"md5": "^2.3.0", "md5": "^2.3.0",
"moment": "^2.29.4", "moment": "^2.29.4",
"mongoose": "^8.4.4", "mongoose": "^8.4.4",
@ -114,6 +116,8 @@
"typesense": "^1.5.4", "typesense": "^1.5.4",
"unstructured-client": "^0.11.3", "unstructured-client": "^0.11.3",
"uuid": "^10.0.0", "uuid": "^10.0.0",
"winston": "^3.14.2",
"winston-transport": "^4.8.0",
"wordpos": "^2.1.0", "wordpos": "^2.1.0",
"ws": "^8.18.0", "ws": "^8.18.0",
"xml2js": "^0.6.2", "xml2js": "^0.6.2",

File diff suppressed because it is too large Load Diff

View File

@ -1,15 +1,15 @@
### Crawl Website ### Crawl Website
POST http://localhost:3002/v0/scrape HTTP/1.1 POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer fc- Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
{ {
"url":"corterix.com" "url":"firecrawl.dev"
} }
### Check Job Status ### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
Authorization: Bearer fc- Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Check Job Status ### Check Job Status
@ -18,7 +18,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
### Scrape Website ### Scrape Website
POST http://localhost:3002/v0/crawl HTTP/1.1 POST http://localhost:3002/v0/crawl HTTP/1.1
Authorization: Bearer fc- Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
{ {
@ -45,7 +45,7 @@ content-type: application/json
### Scrape Website ### Scrape Website
POST http://localhost:3002/v0/scrape HTTP/1.1 POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
{ {
@ -56,12 +56,12 @@ content-type: application/json
### Check Job Status ### Check Job Status
GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1
Authorization: Bearer Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Get Job Result ### Get Job Result
POST https://api.firecrawl.dev/v0/crawl HTTP/1.1 POST https://api.firecrawl.dev/v0/crawl HTTP/1.1
Authorization: Bearer Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json content-type: application/json
{ {
@ -70,7 +70,7 @@ content-type: application/json
### Check Job Status ### Check Job Status
GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66
Authorization: Bearer Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Get Active Jobs Count ### Get Active Jobs Count
GET http://localhost:3002/serverHealthCheck GET http://localhost:3002/serverHealthCheck

View File

@ -0,0 +1,2 @@
html-to-markdown.so
html-to-markdown.h

View File

@ -844,7 +844,7 @@ describe("E2E Tests for API Routes", () => {
expect(crawlInitResponse.statusCode).toBe(200); expect(crawlInitResponse.statusCode).toBe(200);
expect(crawlInitResponse.body).toHaveProperty("jobId"); expect(crawlInitResponse.body).toHaveProperty("jobId");
let crawlStatus: string; let crawlStatus: string = "scraping";
let crawlData = []; let crawlData = [];
while (crawlStatus !== "completed") { while (crawlStatus !== "completed") {
const statusResponse = await request(TEST_URL) const statusResponse = await request(TEST_URL)

View File

@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => {
process.env.SCRAPING_BEE_API_KEY = ""; process.env.SCRAPING_BEE_API_KEY = "";
process.env.OPENAI_API_KEY = ""; process.env.OPENAI_API_KEY = "";
process.env.BULL_AUTH_KEY = ""; process.env.BULL_AUTH_KEY = "";
process.env.LOGTAIL_KEY = "";
process.env.PLAYWRIGHT_MICROSERVICE_URL = ""; process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
process.env.LLAMAPARSE_API_KEY = ""; process.env.LLAMAPARSE_API_KEY = "";
process.env.TEST_API_KEY = ""; process.env.TEST_API_KEY = "";

View File

@ -1,7 +1,7 @@
import request from "supertest"; import request from "supertest";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { import {
ScrapeRequest, ScrapeRequestInput,
ScrapeResponseRequestTest, ScrapeResponseRequestTest,
} from "../../controllers/v1/types"; } from "../../controllers/v1/types";
@ -44,7 +44,7 @@ describe("E2E Tests for v1 API Routes", () => {
}); });
it.concurrent("should throw error for blocklisted URL", async () => { it.concurrent("should throw error for blocklisted URL", async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://facebook.com/fake-test", url: "https://facebook.com/fake-test",
}; };
@ -73,7 +73,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response with a valid API key", "should return a successful response with a valid API key",
async () => { async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai", url: "https://roastmywebsite.ai",
}; };
@ -125,7 +125,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response with a valid API key", "should return a successful response with a valid API key",
async () => { async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/abs/2410.04840", url: "https://arxiv.org/abs/2410.04840",
}; };
@ -167,7 +167,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true", "should return a successful response with a valid API key and includeHtml set to true",
async () => { async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai", url: "https://roastmywebsite.ai",
formats: ["markdown", "html"], formats: ["markdown", "html"],
}; };
@ -194,7 +194,7 @@ describe("E2E Tests for v1 API Routes", () => {
30000 30000
); );
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => { it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
// formats: ["markdown", "html"], // formats: ["markdown", "html"],
}; };
@ -217,7 +217,7 @@ describe("E2E Tests for v1 API Routes", () => {
}, 60000); }, 60000);
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://arxiv.org/pdf/astro-ph/9301001" url: "https://arxiv.org/pdf/astro-ph/9301001"
}; };
const response: ScrapeResponseRequestTest = await request(TEST_URL) const response: ScrapeResponseRequestTest = await request(TEST_URL)
@ -240,7 +240,7 @@ describe("E2E Tests for v1 API Routes", () => {
}, 60000); }, 60000);
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://www.scrapethissite.com/", url: "https://www.scrapethissite.com/",
onlyMainContent: false // default is true onlyMainContent: false // default is true
}; };
@ -261,7 +261,7 @@ describe("E2E Tests for v1 API Routes", () => {
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
const scrapeRequestWithRemoveTags: ScrapeRequest = { const scrapeRequestWithRemoveTags: ScrapeRequestInput = {
url: "https://www.scrapethissite.com/", url: "https://www.scrapethissite.com/",
excludeTags: ['.nav', '#footer', 'strong'], excludeTags: ['.nav', '#footer', 'strong'],
onlyMainContent: false // default is true onlyMainContent: false // default is true
@ -407,7 +407,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response with a valid API key and includeHtml set to true", "should return a successful response with a valid API key and includeHtml set to true",
async () => { async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai", url: "https://roastmywebsite.ai",
formats: ["html","rawHtml"], formats: ["html","rawHtml"],
}; };
@ -438,7 +438,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response with waitFor", "should return a successful response with waitFor",
async () => { async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://ycombinator.com/companies", url: "https://ycombinator.com/companies",
formats: ["markdown"], formats: ["markdown"],
waitFor: 8000 waitFor: 8000
@ -471,7 +471,7 @@ describe("E2E Tests for v1 API Routes", () => {
it.concurrent( it.concurrent(
"should return a successful response with a valid links on page", "should return a successful response with a valid links on page",
async () => { async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://roastmywebsite.ai", url: "https://roastmywebsite.ai",
formats: ["links"], formats: ["links"],
}; };
@ -672,7 +672,7 @@ describe("POST /v1/crawl", () => {
}); });
it.concurrent("should throw error for blocklisted URL", async () => { it.concurrent("should throw error for blocklisted URL", async () => {
const scrapeRequest: ScrapeRequest = { const scrapeRequest: ScrapeRequestInput = {
url: "https://facebook.com/fake-test", url: "https://facebook.com/fake-test",
}; };

View File

@ -0,0 +1,603 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import {
ScrapeRequest,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
configDotenv();
const FIRECRAWL_API_URL = "http://127.0.0.1:3002";
const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test
describe("E2E Tests for v1 API Routes", () => {
// A target page answering 403 must still yield a 200 from the API, with the
// upstream status reported in the document metadata.
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
  const bearer = `Bearer ${process.env.TEST_API_KEY}`;
  const payload = { url: 'https://httpstat.us/403' };

  const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
    .post('/v1/scrape')
    .set('Authorization', bearer)
    .set('Content-Type', 'application/json')
    .send(payload);

  const { statusCode, body } = res;
  expect(statusCode).toBe(200);
  expect(body).toHaveProperty('data');
  // Narrow the response union so `data` can be accessed below.
  if (!("data" in body)) {
    throw new Error("Expected response body to have 'data' property");
  }

  const { data } = body;
  expect(data).toHaveProperty('markdown');
  expect(data).toHaveProperty('metadata');
  expect(data.metadata.statusCode).toBe(403);
}, 30000);
// With no `formats` given, the scrape is expected to return markdown of the
// main content only.
it.concurrent("should handle 'formats:markdown (default)' parameter correctly",
  async () => {
    const bearer = `Bearer ${process.env.TEST_API_KEY}`;
    const payload = { url: E2E_TEST_SERVER_URL } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", bearer)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).toHaveProperty("markdown");

    // Content that must be present.
    expect(data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(data.markdown).toContain("Content with id #content-1");
    // expect(data.markdown).toContain("Loading...");
    expect(data.markdown).toContain("Click me!");
    expect(data.markdown).toContain("Power your AI apps with clean data crawled from any website. It's also open-source."); // firecrawl.dev inside an iframe
    expect(data.markdown).toContain("This content loads only when you see it. Don't blink! 👼"); // the browser always scroll to the bottom

    // Content that must be absent.
    expect(data.markdown).not.toContain("Header"); // Only main content is returned by default
    expect(data.markdown).not.toContain("footer"); // Only main content is returned by default
    expect(data.markdown).not.toContain("This content is only visible on mobile");
  },
  30000);
// Requesting only "html" must return cleaned HTML and no markdown.
it.concurrent("should handle 'formats:html' parameter correctly",
  async () => {
    const bearer = `Bearer ${process.env.TEST_API_KEY}`;
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["html"]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", bearer)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).not.toHaveProperty("markdown");
    expect(data).toHaveProperty("html");

    // The page header must be stripped; the main paragraph must remain.
    expect(data.html).not.toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
    expect(data.html).toContain("<p style=\"\">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
  },
  30000);
// Requesting only "rawHtml" must return the unfiltered page, header included,
// and no markdown.
it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly",
  async () => {
    const bearer = `Bearer ${process.env.TEST_API_KEY}`;
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["rawHtml"]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", bearer)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data).not.toHaveProperty("markdown");
    expect(data).toHaveProperty("rawHtml");
    expect(data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
    expect(data.rawHtml).toContain(">Header</header>");
  },
  30000);
// - TODO: tests for links
// - TODO: tests for screenshot
// - TODO: tests for screenshot@fullPage
// Custom request headers must be forwarded to the target, which echoes them
// back into the page content.
it.concurrent("should handle 'headers' parameter correctly", async () => {
  const bearer = `Bearer ${process.env.TEST_API_KEY}`;
  // @ts-ignore
  const payload = {
    url: E2E_TEST_SERVER_URL,
    headers: { "e2e-header-test": "firecrawl" }
  } as ScrapeRequest;

  const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
    .post("/v1/scrape")
    .set("Authorization", bearer)
    .set("Content-Type", "application/json")
    .send(payload);

  const { statusCode, body } = res;
  expect(statusCode).toBe(200);
  expect(body).toHaveProperty("data");
  // Narrow the response union so `data` can be accessed below.
  if (!("data" in body)) {
    throw new Error("Expected response body to have 'data' property");
  }

  expect(body.data.markdown).toContain("e2e-header-test: firecrawl");
}, 30000);
// `includeTags` must restrict the output to the selected element(s): content
// inside `#content-1` is kept, and the intro paragraph outside it is dropped.
it.concurrent("should handle 'includeTags' parameter correctly",
  async () => {
    const scrapeRequest = {
      url: E2E_TEST_SERVER_URL,
      includeTags: ['#content-1']
    } as ScrapeRequest;

    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    // Match on the plain text rather than an HTML `<p>…</p>` string: the field
    // under test is markdown, so a tag-wrapped needle would not be found and the
    // negative assertion would pass regardless of whether includeTags worked.
    expect(response.body.data.markdown).not.toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(response.body.data.markdown).toContain("Content with id #content-1");
  },
  30000);
// `excludeTags` must remove the selected element(s) and leave the rest of the
// page intact.
it.concurrent("should handle 'excludeTags' parameter correctly",
  async () => {
    const bearer = `Bearer ${process.env.TEST_API_KEY}`;
    const payload = {
      url: E2E_TEST_SERVER_URL,
      excludeTags: ['#content-1']
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", bearer)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { markdown } = body.data;
    expect(markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(markdown).not.toContain("Content with id #content-1");
  },
  30000);
// With `onlyMainContent: false` the page chrome (here the header element) must
// be kept in the returned HTML.
it.concurrent("should handle 'onlyMainContent' parameter correctly",
  async () => {
    const bearer = `Bearer ${process.env.TEST_API_KEY}`;
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["html", "markdown"],
      onlyMainContent: false
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", bearer)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);
    expect(body).toHaveProperty("data");
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const { data } = body;
    expect(data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.");
    expect(data.html).toContain("<header class=\"row-start-1\" style=\"\">Header</header>");
  },
  30000);
// A 500 ms `timeout` is expected to produce a 408 with the timeout error payload.
it.concurrent("should handle 'timeout' parameter correctly",
  async () => {
    const bearer = `Bearer ${process.env.TEST_API_KEY}`;
    const payload = {
      url: E2E_TEST_SERVER_URL,
      timeout: 500
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", bearer)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(408);

    // Narrow the response union so `error` and `success` can be accessed below.
    if (!("error" in body)) {
      throw new Error("Expected response body to have 'error' property");
    }
    expect(body.error).toBe("Request timed out");
    expect(body.success).toBe(false);
  }, 30000);
// With `mobile: true` the mobile-only section of the test page must appear in
// the markdown.
it.concurrent("should handle 'mobile' parameter correctly",
  async () => {
    const bearer = `Bearer ${process.env.TEST_API_KEY}`;
    const payload = {
      url: E2E_TEST_SERVER_URL,
      mobile: true
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", bearer)
      .set("Content-Type", "application/json")
      .send(payload);

    const { statusCode, body } = res;
    expect(statusCode).toBe(200);

    // Narrow the response union so `data` can be accessed below.
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(body.data.markdown).toContain("This content is only visible on mobile");
  },
  30000);
// Scrapes the same PDF twice: once with default options, where the markdown is
// expected to contain the parsed text, and once with `parsePDF: false`, where
// the markdown is expected to contain a marker string that the parsed output
// must not contain.
it.concurrent("should handle 'parsePDF' parameter correctly",
  async () => {
    // No artificial delays: each request is fully awaited before its assertions
    // run, so sleeping afterwards only used up the 30 s test budget.
    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'});

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty('data');
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993');
    expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');

    const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false });

    expect(responseNoParsePDF.statusCode).toBe(200);
    expect(responseNoParsePDF.body).toHaveProperty('data');
    // Narrow the response union so `data` can be accessed below.
    if (!("data" in responseNoParsePDF.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm');
  },
  30000);
// it.concurrent("should handle 'location' parameter correctly",
// async () => {
// const scrapeRequest: ScrapeRequest = {
// url: "https://roastmywebsite.ai",
// location: {
// country: "US",
// languages: ["en"]
// }
// };
// const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
// .post("/v1/scrape")
// .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
// .set("Content-Type", "application/json")
// .send(scrapeRequest);
// expect(response.statusCode).toBe(200);
// // Add assertions to verify location is handled correctly
// },
// 30000);
// Scrapes a host with an expired TLS certificate twice.
//  1. Without `skipTlsVerification`: the API still answers 200, but the
//     document metadata must report `pageStatusCode` 500.
//  2. With `skipTlsVerification: true`: the API answers 200 and the markdown
//     must contain "badssl.com".
// NOTE(review): each request sets `timeout: 120000`, while the Jest timeout
// covering both sequential requests is 60000 - confirm this is intended.
it.concurrent("should handle 'skipTlsVerification' parameter correctly",
  async () => {
    const scrapeRequest = {
      url: "https://expired.badssl.com/",
      timeout: 120000
    } as ScrapeRequest;
    const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);
    expect(response.statusCode).toBe(200);
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(response.body.data.metadata.pageStatusCode).toBe(500);

    const scrapeRequestWithSkipTlsVerification = {
      url: "https://expired.badssl.com/",
      skipTlsVerification: true,
      timeout: 120000
    } as ScrapeRequest;
    const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequestWithSkipTlsVerification);
    expect(responseWithSkipTlsVerification.statusCode).toBe(200);
    if (!("data" in responseWithSkipTlsVerification.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com");
  },
  60000);
// Scrapes the e2e test page with `removeBase64Images: true`. Only the status
// code and the presence of `data` are verified for now; the content assertion
// is disabled (see TODO below).
it.concurrent("should handle 'removeBase64Images' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      removeBase64Images: true
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const { body } = res;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    // TODO: not working for every image; re-enable once it does:
    // expect(body.data.markdown).toContain("Image-Removed");
  },
  30000);
// Runs a single `wait` action (milliseconds: 10000) before scraping and
// expects the markdown to show the late-loaded text instead of "Loading...".
it.concurrent("should handle 'action wait' parameter correctly",
  async () => {
    const waitAction = {
      type: "wait",
      milliseconds: 10000
    };
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [waitAction]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const { body } = res;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const markdown = body.data.markdown;
    expect(markdown).not.toContain("Loading...");
    expect(markdown).toContain("Content loaded after 5 seconds!");
  },
  30000);
// Screenshot action: requests one `screenshot` action and expects the first
// entry of `data.actions.screenshots` to be a non-empty string containing the
// storage URL prefix.
it.concurrent("should handle 'action screenshot' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [{
        type: "screenshot"
      }]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const { body } = res;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    const screenshots = body.data.actions?.screenshots;
    if (!screenshots) {
      throw new Error("Expected response body to have screenshots array");
    }
    expect(screenshots[0].length).toBeGreaterThan(0);
    expect(screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");
    // TODO compare screenshot with expected screenshot
  },
  30000);
// Requests a full-page `screenshot` action followed by a `scrape` action and
// checks both result arrays: the first screenshot must contain the storage URL
// prefix, and the first scrape must carry the expected URL and HTML snippet.
it.concurrent("should handle 'action screenshot@fullPage' parameter correctly",
  async () => {
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [
        { type: "screenshot", fullPage: true },
        { type: "scrape" }
      ]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const { body } = res;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }

    // Screenshot results are validated first, then the scrape results.
    const screenshots = body.data.actions?.screenshots;
    if (!screenshots) {
      throw new Error("Expected response body to have screenshots array");
    }
    expect(screenshots[0].length).toBeGreaterThan(0);
    expect(screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-");

    const scrapes = body.data.actions?.scrapes;
    if (!scrapes) {
      throw new Error("Expected response body to have scrapes array");
    }
    expect(scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/");
    expect(scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.</p>");
    // TODO compare screenshot with expected full page screenshot
  },
  30000);
// Clicks `#click-me` before scraping and expects the markdown to contain the
// post-click text and no longer contain the original button label.
it.concurrent("should handle 'action click' parameter correctly",
  async () => {
    const clickAction = {
      type: "click",
      selector: "#click-me"
    };
    const payload = {
      url: E2E_TEST_SERVER_URL,
      actions: [clickAction]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const { body } = res;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const markdown = body.data.markdown;
    expect(markdown).not.toContain("Click me!");
    expect(markdown).toContain("Text changed after click!");
  },
  30000);
// Clicks `#input-1`, then writes "Hello, world!" and requests the `html`
// format. Only the status code and the presence of `data` are asserted; the
// HTML assertion is disabled until fire-engine is fixed (see TODO below).
it.concurrent("should handle 'action write' parameter correctly",
  async () => {
    const focusInput = { type: "click", selector: "#input-1" };
    const typeText = { type: "write", text: "Hello, world!" };
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["html"],
      actions: [focusInput, typeText]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    expect(res.statusCode).toBe(200);

    const { body } = res;
    if (!("data" in body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    // TODO: fix this test (need to fix fire-engine first)
    // uncomment the following line:
    // expect(body.data.html).toContain("<input id=\"input-1\" type=\"text\" placeholder=\"Enter text here...\" style=\"padding:8px;margin:10px;border:1px solid #ccc;border-radius:4px;background-color:#000\" value=\"Hello, world!\">");
  },
  30000);
// TODO: fix this test (need to fix fire-engine first)
// Sends a `press` action for "ArrowDown". The request is issued, but every
// assertion is currently disabled, so this test only fails if the request
// itself throws.
it.concurrent("should handle 'action pressKey' parameter correctly",
  async () => {
    const pressArrowDown = {
      type: "press",
      key: "ArrowDown"
    };
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["markdown"],
      actions: [pressArrowDown]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    // TODO: fix this test (need to fix fire-engine first)
    // right now res.body is: { success: false, error: '(Internal server error) - null' }
    // expect(res.statusCode).toBe(200);
    // if (!("data" in res.body)) {
    //   throw new Error("Expected response body to have 'data' property");
    // }
    // expect(res.body.data.markdown).toContain("Last Key Clicked: ArrowDown")
  },
  30000);
// TODO: fix this test (need to fix fire-engine first)
// Clicks `#scroll-bottom-loader` and then scrolls down by 2000. The request is
// issued, but every assertion is currently disabled, so this test only fails
// if the request itself throws.
it.concurrent("should handle 'action scroll' parameter correctly",
  async () => {
    const clickLoader = {
      type: "click",
      selector: "#scroll-bottom-loader"
    };
    const scrollDown = {
      type: "scroll",
      direction: "down",
      amount: 2000
    };
    const payload = {
      url: E2E_TEST_SERVER_URL,
      formats: ["markdown"],
      actions: [clickLoader, scrollDown]
    } as ScrapeRequest;

    const res: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(payload);

    // TODO: uncomment these assertions
    // expect(res.statusCode).toBe(200);
    // if (!("data" in res.body)) {
    //   throw new Error("Expected response body to have 'data' property");
    // }
    //
    // expect(res.body.data.markdown).toContain("You have reached the bottom!")
  },
  30000);
// TODO: test scrape action
});

View File

@ -776,7 +776,8 @@ describe("E2E Tests for v0 API Routes", () => {
await new Promise((r) => setTimeout(r, 10000)); await new Promise((r) => setTimeout(r, 10000));
const completedResponse = await request(TEST_URL) const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.maxResponseSize(4000000000);
expect(completedResponse.statusCode).toBe(200); expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");

View File

@ -9,9 +9,8 @@ import {
import { supabase_service } from "../services/supabase"; import { supabase_service } from "../services/supabase";
import { withAuth } from "../lib/withAuth"; import { withAuth } from "../lib/withAuth";
import { RateLimiterRedis } from "rate-limiter-flexible"; import { RateLimiterRedis } from "rate-limiter-flexible";
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
import { sendNotification } from "../services/notification/email_notification"; import { sendNotification } from "../services/notification/email_notification";
import { Logger } from "../lib/logger"; import { logger } from "../lib/logger";
import { redlock } from "../services/redlock"; import { redlock } from "../services/redlock";
import { deleteKey, getValue } from "../services/redis"; import { deleteKey, getValue } from "../services/redis";
import { setValue } from "../services/redis"; import { setValue } from "../services/redis";
@ -40,8 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean {
export async function setCachedACUC( export async function setCachedACUC(
api_key: string, api_key: string,
acuc: acuc:
| AuthCreditUsageChunk | AuthCreditUsageChunk | null
| ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk) | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null)
) { ) {
const cacheKeyACUC = `acuc_${api_key}`; const cacheKeyACUC = `acuc_${api_key}`;
const redLockKey = `lock_${cacheKeyACUC}`; const redLockKey = `lock_${cacheKeyACUC}`;
@ -49,7 +48,7 @@ export async function setCachedACUC(
try { try {
await redlock.using([redLockKey], 10000, {}, async (signal) => { await redlock.using([redLockKey], 10000, {}, async (signal) => {
if (typeof acuc === "function") { if (typeof acuc === "function") {
acuc = acuc(JSON.parse(await getValue(cacheKeyACUC))); acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? "null"));
if (acuc === null) { if (acuc === null) {
if (signal.aborted) { if (signal.aborted) {
@ -69,7 +68,7 @@ export async function setCachedACUC(
await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true); await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true);
}); });
} catch (error) { } catch (error) {
Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`); logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`);
} }
} }
@ -103,7 +102,7 @@ export async function getACUC(
break; break;
} }
Logger.warn( logger.warn(
`Failed to retrieve authentication and credit usage data after ${retries}, trying again...` `Failed to retrieve authentication and credit usage data after ${retries}, trying again...`
); );
retries++; retries++;
@ -146,33 +145,14 @@ export async function authenticateUser(
res, res,
mode?: RateLimiterMode mode?: RateLimiterMode
): Promise<AuthResponse> { ): Promise<AuthResponse> {
return withAuth(supaAuthenticateUser)(req, res, mode); return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode);
}
function setTrace(team_id: string, api_key: string) {
try {
setTraceAttributes({
team_id,
api_key,
});
} catch (error) {
Sentry.captureException(error);
Logger.error(`Error setting trace attributes: ${error.message}`);
}
} }
export async function supaAuthenticateUser( export async function supaAuthenticateUser(
req, req,
res, res,
mode?: RateLimiterMode mode?: RateLimiterMode
): Promise<{ ): Promise<AuthResponse> {
success: boolean;
team_id?: string;
error?: string;
status?: number;
plan?: PlanType;
chunk?: AuthCreditUsageChunk;
}> {
const authHeader = const authHeader =
req.headers.authorization ?? req.headers.authorization ??
(req.headers["sec-websocket-protocol"] (req.headers["sec-websocket-protocol"]
@ -200,7 +180,7 @@ export async function supaAuthenticateUser(
let teamId: string | null = null; let teamId: string | null = null;
let priceId: string | null = null; let priceId: string | null = null;
let chunk: AuthCreditUsageChunk; let chunk: AuthCreditUsageChunk | null = null;
if (token == "this_is_just_a_preview_token") { if (token == "this_is_just_a_preview_token") {
if (mode == RateLimiterMode.CrawlStatus) { if (mode == RateLimiterMode.CrawlStatus) {
@ -233,8 +213,6 @@ export async function supaAuthenticateUser(
priceId = chunk.price_id; priceId = chunk.price_id;
const plan = getPlanByPriceId(priceId); const plan = getPlanByPriceId(priceId);
// HyperDX Logging
setTrace(teamId, normalizedApi);
subscriptionData = { subscriptionData = {
team_id: teamId, team_id: teamId,
plan, plan,
@ -291,7 +269,7 @@ export async function supaAuthenticateUser(
try { try {
await rateLimiter.consume(team_endpoint_token); await rateLimiter.consume(team_endpoint_token);
} catch (rateLimiterRes) { } catch (rateLimiterRes) {
Logger.error(`Rate limit exceeded: ${rateLimiterRes}`); logger.error(`Rate limit exceeded: ${rateLimiterRes}`);
const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
@ -318,7 +296,7 @@ export async function supaAuthenticateUser(
mode === RateLimiterMode.CrawlStatus || mode === RateLimiterMode.CrawlStatus ||
mode === RateLimiterMode.Search) mode === RateLimiterMode.Search)
) { ) {
return { success: true, team_id: "preview" }; return { success: true, team_id: "preview", chunk: null };
// check the origin of the request and make sure its from firecrawl.dev // check the origin of the request and make sure its from firecrawl.dev
// const origin = req.headers.origin; // const origin = req.headers.origin;
// if (origin && origin.includes("firecrawl.dev")){ // if (origin && origin.includes("firecrawl.dev")){
@ -333,12 +311,12 @@ export async function supaAuthenticateUser(
return { return {
success: true, success: true,
team_id: subscriptionData.team_id, team_id: teamId ?? undefined,
plan: (subscriptionData.plan ?? "") as PlanType, plan: (subscriptionData?.plan ?? "") as PlanType,
chunk, chunk,
}; };
} }
function getPlanByPriceId(price_id: string): PlanType { function getPlanByPriceId(price_id: string | null): PlanType {
switch (price_id) { switch (price_id) {
case process.env.STRIPE_PRICE_ID_STARTER: case process.env.STRIPE_PRICE_ID_STARTER:
return "starter"; return "starter";

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { supabase_service } from "../../../services/supabase"; import { supabase_service } from "../../../services/supabase";
import { clearACUC } from "../../auth"; import { clearACUC } from "../../auth";
import { Logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
export async function acucCacheClearController(req: Request, res: Response) { export async function acucCacheClearController(req: Request, res: Response) {
try { try {
@ -12,11 +12,11 @@ export async function acucCacheClearController(req: Request, res: Response) {
.select("*") .select("*")
.eq("team_id", team_id); .eq("team_id", team_id);
await Promise.all(keys.data.map((x) => clearACUC(x.key))); await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key)));
res.json({ ok: true }); res.json({ ok: true });
} catch (error) { } catch (error) {
Logger.error(`Error clearing ACUC cache via API route: ${error}`); logger.error(`Error clearing ACUC cache via API route: ${error}`);
res.status(500).json({ error: "Internal server error" }); res.status(500).json({ error: "Internal server error" });
} }
} }

View File

@ -1,7 +1,7 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { Job } from "bullmq"; import { Job } from "bullmq";
import { Logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
import { getScrapeQueue } from "../../../services/queue-service"; import { getScrapeQueue } from "../../../services/queue-service";
import { checkAlerts } from "../../../services/alerts"; import { checkAlerts } from "../../../services/alerts";
import { sendSlackWebhook } from "../../../services/alerts/slack"; import { sendSlackWebhook } from "../../../services/alerts/slack";
@ -10,7 +10,7 @@ export async function cleanBefore24hCompleteJobsController(
req: Request, req: Request,
res: Response res: Response
) { ) {
Logger.info("🐂 Cleaning jobs older than 24h"); logger.info("🐂 Cleaning jobs older than 24h");
try { try {
const scrapeQueue = getScrapeQueue(); const scrapeQueue = getScrapeQueue();
const batchSize = 10; const batchSize = 10;
@ -31,7 +31,7 @@ export async function cleanBefore24hCompleteJobsController(
).flat(); ).flat();
const before24hJobs = const before24hJobs =
completedJobs.filter( completedJobs.filter(
(job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 (job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000
) || []; ) || [];
let count = 0; let count = 0;
@ -45,12 +45,12 @@ export async function cleanBefore24hCompleteJobsController(
await job.remove(); await job.remove();
count++; count++;
} catch (jobError) { } catch (jobError) {
Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`); logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`);
} }
} }
return res.status(200).send(`Removed ${count} completed jobs.`); return res.status(200).send(`Removed ${count} completed jobs.`);
} catch (error) { } catch (error) {
Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`); logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`);
return res.status(500).send("Failed to clean jobs"); return res.status(500).send("Failed to clean jobs");
} }
} }
@ -60,7 +60,7 @@ export async function checkQueuesController(req: Request, res: Response) {
await checkAlerts(); await checkAlerts();
return res.status(200).send("Alerts initialized"); return res.status(200).send("Alerts initialized");
} catch (error) { } catch (error) {
Logger.debug(`Failed to initialize alerts: ${error}`); logger.debug(`Failed to initialize alerts: ${error}`);
return res.status(500).send("Failed to initialize alerts"); return res.status(500).send("Failed to initialize alerts");
} }
} }
@ -81,7 +81,7 @@ export async function queuesController(req: Request, res: Response) {
noActiveJobs, noActiveJobs,
}); });
} catch (error) { } catch (error) {
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
@ -165,7 +165,7 @@ export async function autoscalerController(req: Request, res: Response) {
} }
if (targetMachineCount !== activeMachines) { if (targetMachineCount !== activeMachines) {
Logger.info( logger.info(
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting` `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
); );
@ -193,7 +193,7 @@ export async function autoscalerController(req: Request, res: Response) {
count: activeMachines, count: activeMachines,
}); });
} catch (error) { } catch (error) {
Logger.error(error); logger.error(error);
return res.status(500).send("Failed to initialize autoscaler"); return res.status(500).send("Failed to initialize autoscaler");
} }
} }

View File

@ -1,6 +1,6 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import Redis from "ioredis"; import Redis from "ioredis";
import { Logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
import { redisRateLimitClient } from "../../../services/rate-limiter"; import { redisRateLimitClient } from "../../../services/rate-limiter";
export async function redisHealthController(req: Request, res: Response) { export async function redisHealthController(req: Request, res: Response) {
@ -10,14 +10,14 @@ export async function redisHealthController(req: Request, res: Response) {
return await operation(); return await operation();
} catch (error) { } catch (error) {
if (attempt === retries) throw error; if (attempt === retries) throw error;
Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`); logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`);
await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying
} }
} }
}; };
try { try {
const queueRedis = new Redis(process.env.REDIS_URL); const queueRedis = new Redis(process.env.REDIS_URL!);
const testKey = "test"; const testKey = "test";
const testValue = "test"; const testValue = "test";
@ -29,7 +29,7 @@ export async function redisHealthController(req: Request, res: Response) {
queueRedisHealth = await retryOperation(() => queueRedis.get(testKey)); queueRedisHealth = await retryOperation(() => queueRedis.get(testKey));
await retryOperation(() => queueRedis.del(testKey)); await retryOperation(() => queueRedis.del(testKey));
} catch (error) { } catch (error) {
Logger.error(`queueRedis health check failed: ${error}`); logger.error(`queueRedis health check failed: ${error}`);
queueRedisHealth = null; queueRedisHealth = null;
} }
@ -42,7 +42,7 @@ export async function redisHealthController(req: Request, res: Response) {
); );
await retryOperation(() => redisRateLimitClient.del(testKey)); await retryOperation(() => redisRateLimitClient.del(testKey));
} catch (error) { } catch (error) {
Logger.error(`redisRateLimitClient health check failed: ${error}`); logger.error(`redisRateLimitClient health check failed: ${error}`);
redisRateLimitHealth = null; redisRateLimitHealth = null;
} }
@ -56,10 +56,10 @@ export async function redisHealthController(req: Request, res: Response) {
healthStatus.queueRedis === "healthy" && healthStatus.queueRedis === "healthy" &&
healthStatus.redisRateLimitClient === "healthy" healthStatus.redisRateLimitClient === "healthy"
) { ) {
Logger.info("Both Redis instances are healthy"); logger.info("Both Redis instances are healthy");
return res.status(200).json({ status: "healthy", details: healthStatus }); return res.status(200).json({ status: "healthy", details: healthStatus });
} else { } else {
Logger.info( logger.info(
`Redis instances health check: ${JSON.stringify(healthStatus)}` `Redis instances health check: ${JSON.stringify(healthStatus)}`
); );
// await sendSlackWebhook( // await sendSlackWebhook(
@ -73,7 +73,7 @@ export async function redisHealthController(req: Request, res: Response) {
.json({ status: "unhealthy", details: healthStatus }); .json({ status: "unhealthy", details: healthStatus });
} }
} catch (error) { } catch (error) {
Logger.error(`Redis health check failed: ${error}`); logger.error(`Redis health check failed: ${error}`);
// await sendSlackWebhook( // await sendSlackWebhook(
// `[REDIS DOWN] Redis instances health check: ${error.message}`, // `[REDIS DOWN] Redis instances health check: ${error.message}`,
// true // true

View File

@ -2,7 +2,7 @@ import { Request, Response } from "express";
import { authenticateUser } from "../auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { supabase_service } from "../../../src/services/supabase"; import { supabase_service } from "../../../src/services/supabase";
import { Logger } from "../../../src/lib/logger"; import { logger } from "../../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis"; import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
@ -12,15 +12,17 @@ export async function crawlCancelController(req: Request, res: Response) {
try { try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
const { success, team_id, error, status } = await authenticateUser( const auth = await authenticateUser(
req, req,
res, res,
RateLimiterMode.CrawlStatus RateLimiterMode.CrawlStatus
); );
if (!success) { if (!auth.success) {
return res.status(status).json({ error }); return res.status(auth.status).json({ error: auth.error });
} }
const { team_id } = auth;
const sc = await getCrawl(req.params.jobId); const sc = await getCrawl(req.params.jobId);
if (!sc) { if (!sc) {
return res.status(404).json({ error: "Job not found" }); return res.status(404).json({ error: "Job not found" });
@ -46,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) {
sc.cancelled = true; sc.cancelled = true;
await saveCrawl(req.params.jobId, sc); await saveCrawl(req.params.jobId, sc);
} catch (error) { } catch (error) {
Logger.error(error); logger.error(error);
} }
res.json({ res.json({
@ -54,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) {
}); });
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }

View File

@ -2,15 +2,17 @@ import { Request, Response } from "express";
import { authenticateUser } from "../auth"; import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service"; import { getScrapeQueue } from "../../../src/services/queue-service";
import { Logger } from "../../../src/lib/logger"; import { logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs"; import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { Job } from "bullmq";
import { toLegacyDocument } from "../v1/types";
configDotenv(); configDotenv();
export async function getJobs(crawlId: string, ids: string[]) { export async function getJobs(crawlId: string, ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[];
if (process.env.USE_DB_AUTHENTICATION === "true") { if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsByCrawlId(crawlId); const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
@ -32,15 +34,17 @@ export async function getJobs(crawlId: string, ids: string[]) {
export async function crawlStatusController(req: Request, res: Response) { export async function crawlStatusController(req: Request, res: Response) {
try { try {
const { success, team_id, error, status } = await authenticateUser( const auth = await authenticateUser(
req, req,
res, res,
RateLimiterMode.CrawlStatus RateLimiterMode.CrawlStatus
); );
if (!success) { if (!auth.success) {
return res.status(status).json({ error }); return res.status(auth.status).json({ error: auth.error });
} }
const { team_id } = auth;
const sc = await getCrawl(req.params.jobId); const sc = await getCrawl(req.params.jobId);
if (!sc) { if (!sc) {
return res.status(404).json({ error: "Job not found" }); return res.status(404).json({ error: "Job not found" });
@ -90,12 +94,12 @@ export async function crawlStatusController(req: Request, res: Response) {
status: jobStatus, status: jobStatus,
current: jobStatuses.filter(x => x === "completed" || x === "failed").length, current: jobStatuses.filter(x => x === "completed" || x === "failed").length,
total: jobs.length, total: jobs.length,
data: jobStatus === "completed" ? data : null, data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null,
partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null), partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)),
}); });
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }

View File

@ -9,24 +9,28 @@ import { validateIdempotencyKey } from "../../../src/services/idempotency/valida
import { createIdempotencyKey } from "../../../src/services/idempotency/create"; import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger"; import { logger } from "../../../src/lib/logger";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service"; import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
import { ZodError } from "zod";
export async function crawlController(req: Request, res: Response) { export async function crawlController(req: Request, res: Response) {
try { try {
const { success, team_id, error, status, plan, chunk } = await authenticateUser( const auth = await authenticateUser(
req, req,
res, res,
RateLimiterMode.Crawl RateLimiterMode.Crawl
); );
if (!success) { if (!auth.success) {
return res.status(status).json({ error }); return res.status(auth.status).json({ error: auth.error });
} }
const { team_id, plan, chunk } = auth;
if (req.headers["x-idempotency-key"]) { if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req); const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) { if (!isIdempotencyValid) {
@ -35,7 +39,7 @@ export async function crawlController(req: Request, res: Response) {
try { try {
createIdempotencyKey(req); createIdempotencyKey(req);
} catch (error) { } catch (error) {
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }
@ -77,7 +81,7 @@ export async function crawlController(req: Request, res: Response) {
// TODO: need to do this to v1 // TODO: need to do this to v1
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
let url = req.body.url; let url = urlSchema.parse(req.body.url);
if (!url) { if (!url) {
return res.status(400).json({ error: "Url is required" }); return res.status(400).json({ error: "Url is required" });
} }
@ -123,7 +127,7 @@ export async function crawlController(req: Request, res: Response) {
// documents: docs, // documents: docs,
// }); // });
// } catch (error) { // } catch (error) {
// Logger.error(error); // logger.error(error);
// return res.status(500).json({ error: error.message }); // return res.status(500).json({ error: error.message });
// } // }
// } // }
@ -132,10 +136,13 @@ export async function crawlController(req: Request, res: Response) {
await logCrawl(id, team_id); await logCrawl(id, team_id);
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
const sc: StoredCrawl = { const sc: StoredCrawl = {
originUrl: url, originUrl: url,
crawlerOptions, crawlerOptions,
pageOptions, scrapeOptions,
internalOptions,
team_id, team_id,
plan, plan,
createdAt: Date.now(), createdAt: Date.now(),
@ -170,10 +177,11 @@ export async function crawlController(req: Request, res: Response) {
data: { data: {
url, url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: crawlerOptions, crawlerOptions,
scrapeOptions,
internalOptions,
team_id, team_id,
plan, plan,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin, origin: req.body.origin ?? defaultOrigin,
crawl_id: id, crawl_id: id,
sitemapped: true, sitemapped: true,
@ -208,10 +216,11 @@ export async function crawlController(req: Request, res: Response) {
{ {
url, url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: crawlerOptions, crawlerOptions,
scrapeOptions,
internalOptions,
team_id, team_id,
plan, plan: plan!,
pageOptions: pageOptions,
origin: req.body.origin ?? defaultOrigin, origin: req.body.origin ?? defaultOrigin,
crawl_id: id, crawl_id: id,
}, },
@ -226,7 +235,9 @@ export async function crawlController(req: Request, res: Response) {
res.json({ jobId: id }); res.json({ jobId: id });
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error instanceof ZodError
? "Invalid URL"
: error.message });
} }
} }

View File

@ -3,15 +3,16 @@ import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types"; import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger"; import { logger } from "../../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs"; import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { fromLegacyScrapeOptions } from "../v1/types";
export async function crawlPreviewController(req: Request, res: Response) { export async function crawlPreviewController(req: Request, res: Response) {
try { try {
const { success, error, status, team_id:a, plan } = await authenticateUser( const auth = await authenticateUser(
req, req,
res, res,
RateLimiterMode.Preview RateLimiterMode.Preview
@ -19,10 +20,12 @@ export async function crawlPreviewController(req: Request, res: Response) {
const team_id = "preview"; const team_id = "preview";
if (!success) { if (!auth.success) {
return res.status(status).json({ error }); return res.status(auth.status).json({ error: auth.error });
} }
const { plan } = auth;
let url = req.body.url; let url = req.body.url;
if (!url) { if (!url) {
return res.status(400).json({ error: "Url is required" }); return res.status(400).json({ error: "Url is required" });
@ -71,7 +74,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
// documents: docs, // documents: docs,
// }); // });
// } catch (error) { // } catch (error) {
// Logger.error(error); // logger.error(error);
// return res.status(500).json({ error: error.message }); // return res.status(500).json({ error: error.message });
// } // }
// } // }
@ -84,10 +87,13 @@ export async function crawlPreviewController(req: Request, res: Response) {
robots = await this.getRobotsTxt(); robots = await this.getRobotsTxt();
} catch (_) {} } catch (_) {}
const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined);
const sc: StoredCrawl = { const sc: StoredCrawl = {
originUrl: url, originUrl: url,
crawlerOptions, crawlerOptions,
pageOptions, scrapeOptions,
internalOptions,
team_id, team_id,
plan, plan,
robots, robots,
@ -107,10 +113,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
await addScrapeJob({ await addScrapeJob({
url, url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id, team_id,
plan, plan: plan!,
pageOptions: pageOptions, crawlerOptions,
scrapeOptions,
internalOptions,
origin: "website-preview", origin: "website-preview",
crawl_id: id, crawl_id: id,
sitemapped: true, sitemapped: true,
@ -123,10 +130,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
await addScrapeJob({ await addScrapeJob({
url, url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id, team_id,
plan, plan: plan!,
pageOptions: pageOptions, crawlerOptions,
scrapeOptions,
internalOptions,
origin: "website-preview", origin: "website-preview",
crawl_id: id, crawl_id: id,
}, {}, jobId); }, {}, jobId);
@ -136,7 +144,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
res.json({ jobId: id }); res.json({ jobId: id });
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }

View File

@ -8,13 +8,14 @@ import { authenticateUser } from "../auth";
export const keyAuthController = async (req: Request, res: Response) => { export const keyAuthController = async (req: Request, res: Response) => {
try { try {
// make sure to authenticate user first, Bearer <token> // make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status } = await authenticateUser( const auth = await authenticateUser(
req, req,
res res
); );
if (!success) { if (!auth.success) {
return res.status(status).json({ error }); return res.status(auth.status).json({ error: auth.error });
} }
// if success, return success: true // if success, return success: true
return res.status(200).json({ success: true }); return res.status(200).json({ success: true });
} catch (error) { } catch (error) {

View File

@ -7,7 +7,7 @@ import {
import { authenticateUser } from "../auth"; import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types"; import { PlanType, RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { Document } from "../../lib/entities"; import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { import {
@ -19,9 +19,11 @@ import {
import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { getScrapeQueue } from "../../services/queue-service"; import { getScrapeQueue } from "../../services/queue-service";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { fromLegacyScrapeOptions } from "../v1/types";
import { ZodError } from "zod";
export async function scrapeHelper( export async function scrapeHelper(
jobId: string, jobId: string,
@ -35,10 +37,10 @@ export async function scrapeHelper(
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
data?: Document; data?: Document | { url: string };
returnCode: number; returnCode: number;
}> { }> {
const url = req.body.url; const url = urlSchema.parse(req.body.url);
if (typeof url !== "string") { if (typeof url !== "string") {
return { success: false, error: "Url is required", returnCode: 400 }; return { success: false, error: "Url is required", returnCode: 400 };
} }
@ -54,15 +56,16 @@ export async function scrapeHelper(
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 }); const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions);
await addScrapeJob( await addScrapeJob(
{ {
url, url,
mode: "single_urls", mode: "single_urls",
crawlerOptions,
team_id, team_id,
pageOptions, scrapeOptions,
plan, internalOptions,
extractorOptions, plan: plan!,
origin: req.body.origin ?? defaultOrigin, origin: req.body.origin ?? defaultOrigin,
is_scrape: true, is_scrape: true,
}, },
@ -81,7 +84,7 @@ export async function scrapeHelper(
}, },
async (span) => { async (span) => {
try { try {
doc = (await waitForJob(jobId, timeout))[0]; doc = (await waitForJob<Document>(jobId, timeout));
} catch (e) { } catch (e) {
if (e instanceof Error && e.message.startsWith("Job wait")) { if (e instanceof Error && e.message.startsWith("Job wait")) {
span.setAttribute("timedOut", true); span.setAttribute("timedOut", true);
@ -149,7 +152,7 @@ export async function scrapeHelper(
return { return {
success: true, success: true,
data: doc, data: toLegacyDocument(doc, internalOptions),
returnCode: 200, returnCode: 200,
}; };
} }
@ -158,15 +161,17 @@ export async function scrapeController(req: Request, res: Response) {
try { try {
let earlyReturn = false; let earlyReturn = false;
// make sure to authenticate user first, Bearer <token> // make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan, chunk } = await authenticateUser( const auth = await authenticateUser(
req, req,
res, res,
RateLimiterMode.Scrape RateLimiterMode.Scrape
); );
if (!success) { if (!auth.success) {
return res.status(status).json({ error }); return res.status(auth.status).json({ error: auth.error });
} }
const { team_id, plan, chunk } = auth;
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
const extractorOptions = { const extractorOptions = {
@ -200,7 +205,7 @@ export async function scrapeController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" }); return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" });
} }
} catch (error) { } catch (error) {
Logger.error(error); logger.error(error);
earlyReturn = true; earlyReturn = true;
return res.status(500).json({ return res.status(500).json({
error: error:
@ -224,8 +229,8 @@ export async function scrapeController(req: Request, res: Response) {
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens = const numTokens =
result.data && result.data.markdown result.data && (result.data as Document).markdown
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") ? numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo")
: 0; : 0;
if (result.success) { if (result.success) {
@ -246,7 +251,7 @@ export async function scrapeController(req: Request, res: Response) {
if (creditsToBeBilled > 0) { if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction // billing for doc done on queue end, bill only for llm extraction
billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => { billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here // Optionally, you could notify an admin or add to a retry queue here
}); });
} }
@ -254,17 +259,19 @@ export async function scrapeController(req: Request, res: Response) {
let doc = result.data; let doc = result.data;
if (!pageOptions || !pageOptions.includeRawHtml) { if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && doc.rawHtml) { if (doc && (doc as Document).rawHtml) {
delete doc.rawHtml; delete (doc as Document).rawHtml;
} }
} }
if(pageOptions && pageOptions.includeExtract) { if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && doc.markdown) { if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
delete doc.markdown; delete (doc as Document).markdown;
} }
} }
const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
logJob({ logJob({
job_id: jobId, job_id: jobId,
success: result.success, success: result.success,
@ -276,21 +283,22 @@ export async function scrapeController(req: Request, res: Response) {
mode: "scrape", mode: "scrape",
url: req.body.url, url: req.body.url,
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions, scrapeOptions,
origin: origin, origin: origin,
extractor_options: extractorOptions,
num_tokens: numTokens, num_tokens: numTokens,
}); });
return res.status(result.returnCode).json(result); return res.status(result.returnCode).json(result);
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ return res.status(500).json({
error: error:
typeof error === "string" error instanceof ZodError
? error ? "Invalid URL"
: error?.message ?? "Internal Server Error", : typeof error === "string"
? error
: error?.message ?? "Internal Server Error",
}); });
} }
} }

View File

@ -1,5 +1,4 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { WebScraperDataProvider } from "../../scraper/WebScraper";
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
import { authenticateUser } from "../auth"; import { authenticateUser } from "../auth";
import { PlanType, RateLimiterMode } from "../../types"; import { PlanType, RateLimiterMode } from "../../types";
@ -8,21 +7,23 @@ import { PageOptions, SearchOptions } from "../../lib/entities";
import { search } from "../../search"; import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service"; import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { Job } from "bullmq";
import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types";
export async function searchHelper( export async function searchHelper(
jobId: string, jobId: string,
req: Request, req: Request,
team_id: string, team_id: string,
subscription_id: string, subscription_id: string | null | undefined,
crawlerOptions: any, crawlerOptions: any,
pageOptions: PageOptions, pageOptions: PageOptions,
searchOptions: SearchOptions, searchOptions: SearchOptions,
plan: PlanType plan: PlanType | undefined
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
@ -35,8 +36,8 @@ export async function searchHelper(
return { success: false, error: "Query is required", returnCode: 400 }; return { success: false, error: "Query is required", returnCode: 400 };
} }
const tbs = searchOptions.tbs ?? null; const tbs = searchOptions.tbs ?? undefined;
const filter = searchOptions.filter ?? null; const filter = searchOptions.filter ?? undefined;
let num_results = Math.min(searchOptions.limit ?? 7, 10); let num_results = Math.min(searchOptions.limit ?? 7, 10);
if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") { if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
@ -58,10 +59,11 @@ export async function searchHelper(
let justSearch = pageOptions.fetchPageContent === false; let justSearch = pageOptions.fetchPageContent === false;
const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions);
if (justSearch) { if (justSearch) {
billTeam(team_id, subscription_id, res.length).catch(error => { billTeam(team_id, subscription_id, res.length).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`); logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here // Optionally, you could notify an admin or add to a retry queue here
}); });
return { success: true, data: res, returnCode: 200 }; return { success: true, data: res, returnCode: 200 };
@ -88,9 +90,9 @@ export async function searchHelper(
data: { data: {
url, url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: team_id, team_id: team_id,
pageOptions: pageOptions, scrapeOptions,
internalOptions,
}, },
opts: { opts: {
jobId: uuid, jobId: uuid,
@ -104,7 +106,7 @@ export async function searchHelper(
await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority) await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority)
} }
const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => x[0]); const docs = (await Promise.all(jobDatas.map(x => waitForJob<Document>(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions));
if (docs.length === 0) { if (docs.length === 0) {
return { success: true, error: "No search results found", returnCode: 200 }; return { success: true, error: "No search results found", returnCode: 200 };
@ -115,7 +117,7 @@ export async function searchHelper(
// make sure doc.content is not empty // make sure doc.content is not empty
const filteredDocs = docs.filter( const filteredDocs = docs.filter(
(doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0 (doc: any) => doc && doc.content && doc.content.trim().length > 0
); );
if (filteredDocs.length === 0) { if (filteredDocs.length === 0) {
@ -132,14 +134,15 @@ export async function searchHelper(
export async function searchController(req: Request, res: Response) { export async function searchController(req: Request, res: Response) {
try { try {
// make sure to authenticate user first, Bearer <token> // make sure to authenticate user first, Bearer <token>
const { success, team_id, error, status, plan, chunk } = await authenticateUser( const auth = await authenticateUser(
req, req,
res, res,
RateLimiterMode.Search RateLimiterMode.Search
); );
if (!success) { if (!auth.success) {
return res.status(status).json({ error }); return res.status(auth.status).json({ error: auth.error });
} }
const { team_id, plan, chunk } = auth;
const crawlerOptions = req.body.crawlerOptions ?? {}; const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { const pageOptions = req.body.pageOptions ?? {
includeHtml: req.body.pageOptions?.includeHtml ?? false, includeHtml: req.body.pageOptions?.includeHtml ?? false,
@ -162,7 +165,7 @@ export async function searchController(req: Request, res: Response) {
} }
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: "Internal server error" }); return res.status(500).json({ error: "Internal server error" });
} }
const startTime = new Date().getTime(); const startTime = new Date().getTime();
@ -189,7 +192,6 @@ export async function searchController(req: Request, res: Response) {
mode: "search", mode: "search",
url: req.body.query, url: req.body.query,
crawlerOptions: crawlerOptions, crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
origin: origin, origin: origin,
}); });
return res.status(result.returnCode).json(result); return res.status(result.returnCode).json(result);
@ -199,7 +201,7 @@ export async function searchController(req: Request, res: Response) {
} }
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error("Unhandled error occurred in search", { error });
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }

View File

@ -1,5 +1,5 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { Logger } from "../../../src/lib/logger"; import { logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { getJobs } from "./crawl-status"; import { getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
@ -37,7 +37,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
}); });
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }

View File

@ -4,8 +4,6 @@ import {
BatchScrapeRequest, BatchScrapeRequest,
batchScrapeRequestSchema, batchScrapeRequestSchema,
CrawlResponse, CrawlResponse,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth, RequestWithAuth,
} from "./types"; } from "./types";
import { import {
@ -29,19 +27,16 @@ export async function batchScrapeController(
await logCrawl(id, req.auth.team_id); await logCrawl(id, req.auth.team_id);
let { remainingCredits } = req.account; let { remainingCredits } = req.account!;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){ if(!useDbAuthentication){
remainingCredits = Infinity; remainingCredits = Infinity;
} }
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const sc: StoredCrawl = { const sc: StoredCrawl = {
crawlerOptions: null, crawlerOptions: null,
pageOptions, scrapeOptions: req.body,
internalOptions: {},
team_id: req.auth.team_id, team_id: req.auth.team_id,
createdAt: Date.now(), createdAt: Date.now(),
plan: req.auth.plan, plan: req.auth.plan,
@ -64,10 +59,9 @@ export async function batchScrapeController(
url: x, url: x,
mode: "single_urls" as const, mode: "single_urls" as const,
team_id: req.auth.team_id, team_id: req.auth.team_id,
plan: req.auth.plan, plan: req.auth.plan!,
crawlerOptions: null, crawlerOptions: null,
pageOptions, scrapeOptions: req.body,
extractorOptions,
origin: "api", origin: "api",
crawl_id: id, crawl_id: id,
sitemapped: true, sitemapped: true,

View File

@ -1,6 +1,6 @@
import { Response } from "express"; import { Response } from "express";
import { supabase_service } from "../../services/supabase"; import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis"; import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
@ -36,7 +36,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
sc.cancelled = true; sc.cancelled = true;
await saveCrawl(req.params.jobId, sc); await saveCrawl(req.params.jobId, sc);
} catch (error) { } catch (error) {
Logger.error(error); logger.error(error);
} }
res.json({ res.json({
@ -44,7 +44,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string
}); });
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
} }

View File

@ -1,14 +1,15 @@
import { authMiddleware } from "../../routes/v1"; import { authMiddleware } from "../../routes/v1";
import { RateLimiterMode } from "../../types"; import { RateLimiterMode } from "../../types";
import { authenticateUser } from "../auth"; import { authenticateUser } from "../auth";
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types";
import { WebSocket } from "ws"; import { WebSocket } from "ws";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis"; import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service"; import { getScrapeQueue } from "../../services/queue-service";
import { getJob, getJobs } from "./crawl-status"; import { getJob, getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { Job, JobState } from "bullmq";
type ErrorMessage = { type ErrorMessage = {
type: "error", type: "error",
@ -56,7 +57,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
return close(ws, 3003, { type: "error", error: "Forbidden" }); return close(ws, 3003, { type: "error", error: "Forbidden" });
} }
let doneJobIDs = []; let doneJobIDs: string[] = [];
let finished = false; let finished = false;
const loop = async () => { const loop = async () => {
@ -70,15 +71,14 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x)); const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)])); const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]); const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => x !== undefined) as Job[]
for (const jobID of newlyDoneJobIDs) {
const job = await getJob(jobID);
for (const job of newlyDoneJobs) {
if (job.returnvalue) { if (job.returnvalue) {
send(ws, { send(ws, {
type: "document", type: "document",
data: legacyDocumentConverter(job.returnvalue), data: job.returnvalue,
}) })
} else { } else {
return close(ws, 3000, { type: "error", error: job.failedReason }); return close(ws, 3000, { type: "error", error: job.failedReason });
@ -100,8 +100,8 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
const throttledJobsSet = new Set(throttledJobs); const throttledJobsSet = new Set(throttledJobs);
const validJobStatuses = []; const validJobStatuses: [string, JobState | "unknown"][] = [];
const validJobIDs = []; const validJobIDs: string[] = [];
for (const [id, status] of jobStatuses) { for (const [id, status] of jobStatuses) {
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") { if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
@ -126,7 +126,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
completed: doneJobIDs.length, completed: doneJobIDs.length,
creditsUsed: jobIDs.length, creditsUsed: jobIDs.length,
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(), expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
data: data.map(x => legacyDocumentConverter(x)), data: data,
} }
}); });
@ -139,19 +139,21 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
// Basically just middleware and error wrapping // Basically just middleware and error wrapping
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) { export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
try { try {
const { success, team_id, error, status, plan } = await authenticateUser( const auth = await authenticateUser(
req, req,
null, null,
RateLimiterMode.CrawlStatus, RateLimiterMode.CrawlStatus,
); );
if (!success) { if (!auth.success) {
return close(ws, 3000, { return close(ws, 3000, {
type: "error", type: "error",
error, error: auth.error,
}); });
} }
const { team_id, plan } = auth;
req.auth = { team_id, plan }; req.auth = { team_id, plan };
await crawlStatusWS(ws, req); await crawlStatusWS(ws, req);
@ -170,7 +172,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
} }
} }
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
return close(ws, 1011, { return close(ws, 1011, {
type: "error", type: "error",
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id

View File

@ -1,9 +1,10 @@
import { Response } from "express"; import { Response } from "express";
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types";
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis"; import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service"; import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs"; import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { Job, JobState } from "bullmq";
configDotenv(); configDotenv();
export async function getJob(id: string) { export async function getJob(id: string) {
@ -24,7 +25,7 @@ export async function getJob(id: string) {
} }
export async function getJobs(ids: string[]) { export async function getJobs(ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[];
if (process.env.USE_DB_AUTHENTICATION === "true") { if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsById(ids); const supabaseData = await supabaseGetJobsById(ids);
@ -63,8 +64,8 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const throttledJobsSet = new Set(throttledJobs); const throttledJobsSet = new Set(throttledJobs);
const validJobStatuses = []; const validJobStatuses: [string, JobState | "unknown"][] = [];
const validJobIDs = []; const validJobIDs: string[] = [];
for (const [id, status] of jobStatuses) { for (const [id, status] of jobStatuses) {
if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") { if (!throttledJobsSet.has(id) && status !== "failed" && status !== "unknown") {
@ -81,7 +82,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1); const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
let doneJobs = []; let doneJobs: Job[] = [];
if (end === undefined) { // determine 10 megabyte limit if (end === undefined) { // determine 10 megabyte limit
let bytes = 0; let bytes = 0;
@ -98,7 +99,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) { for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
const job = jobs[ii]; const job = jobs[ii];
doneJobs.push(job); doneJobs.push(job);
bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length; bytes += JSON.stringify(job.returnvalue).length;
} }
} }
@ -122,7 +123,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
} }
if (data.length > 0) { if (data.length > 0) {
if (!doneJobs[0].data.pageOptions.includeRawHtml) { if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) {
for (let ii = 0; ii < doneJobs.length; ii++) { for (let ii = 0; ii < doneJobs.length; ii++) {
if (data[ii]) { if (data[ii]) {
delete data[ii].rawHtml; delete data[ii].rawHtml;
@ -142,7 +143,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
? undefined ? undefined
: nextURL.href, : nextURL.href,
data: data.map(x => legacyDocumentConverter(x)), data: data,
}); });
} }

View File

@ -4,9 +4,8 @@ import {
CrawlRequest, CrawlRequest,
crawlRequestSchema, crawlRequestSchema,
CrawlResponse, CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth, RequestWithAuth,
toLegacyCrawlerOptions,
} from "./types"; } from "./types";
import { import {
addCrawlJob, addCrawlJob,
@ -20,9 +19,10 @@ import {
import { logCrawl } from "../../services/logging/crawl_log"; import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service"; import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs"; import { addScrapeJob } from "../../services/queue-jobs";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook"; import { callWebhook } from "../../services/webhook";
import { scrapeOptions as scrapeOptionsSchema } from "./types";
export async function crawlController( export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
@ -34,18 +34,22 @@ export async function crawlController(
await logCrawl(id, req.auth.team_id); await logCrawl(id, req.auth.team_id);
let { remainingCredits } = req.account; let { remainingCredits } = req.account!;
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if(!useDbAuthentication){ if(!useDbAuthentication){
remainingCredits = Infinity; remainingCredits = Infinity;
} }
const crawlerOptions = legacyCrawlerOptions(req.body); const crawlerOptions = {
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); ...req.body,
url: undefined,
scrapeOptions: undefined,
};
const scrapeOptions = req.body.scrapeOptions;
// TODO: @rafa, is this right? copied from v0 // TODO: @rafa, is this right? copied from v0
if (Array.isArray(crawlerOptions.includes)) { if (Array.isArray(crawlerOptions.includePaths)) {
for (const x of crawlerOptions.includes) { for (const x of crawlerOptions.includePaths) {
try { try {
new RegExp(x); new RegExp(x);
} catch (e) { } catch (e) {
@ -54,8 +58,8 @@ export async function crawlController(
} }
} }
if (Array.isArray(crawlerOptions.excludes)) { if (Array.isArray(crawlerOptions.excludePaths)) {
for (const x of crawlerOptions.excludes) { for (const x of crawlerOptions.excludePaths) {
try { try {
new RegExp(x); new RegExp(x);
} catch (e) { } catch (e) {
@ -68,8 +72,9 @@ export async function crawlController(
const sc: StoredCrawl = { const sc: StoredCrawl = {
originUrl: req.body.url, originUrl: req.body.url,
crawlerOptions, crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
pageOptions, scrapeOptions,
internalOptions: {},
team_id: req.auth.team_id, team_id: req.auth.team_id,
createdAt: Date.now(), createdAt: Date.now(),
plan: req.auth.plan, plan: req.auth.plan,
@ -78,9 +83,9 @@ export async function crawlController(
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc);
try { try {
sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification); sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
} catch (e) { } catch (e) {
Logger.debug( logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e e
)}` )}`
@ -112,7 +117,7 @@ export async function crawlController(
team_id: req.auth.team_id, team_id: req.auth.team_id,
plan: req.auth.plan, plan: req.auth.plan,
crawlerOptions, crawlerOptions,
pageOptions, scrapeOptions,
origin: "api", origin: "api",
crawl_id: id, crawl_id: id,
sitemapped: true, sitemapped: true,
@ -142,10 +147,10 @@ export async function crawlController(
{ {
url: req.body.url, url: req.body.url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id: req.auth.team_id, team_id: req.auth.team_id,
plan: req.auth.plan, crawlerOptions,
pageOptions: pageOptions, scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
plan: req.auth.plan!,
origin: "api", origin: "api",
crawl_id: id, crawl_id: id,
webhook: req.body.webhook, webhook: req.body.webhook,

View File

@ -1,9 +1,9 @@
import { Response } from "express"; import { Response } from "express";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { import {
legacyCrawlerOptions,
mapRequestSchema, mapRequestSchema,
RequestWithAuth, RequestWithAuth,
scrapeOptions,
} from "./types"; } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types"; import { MapResponse, MapRequest } from "./types";
@ -18,11 +18,11 @@ import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing"; import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine"; import { performCosineSimilarity } from "../../lib/map-cosine";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import Redis from "ioredis"; import Redis from "ioredis";
configDotenv(); configDotenv();
const redis = new Redis(process.env.REDIS_URL); const redis = new Redis(process.env.REDIS_URL!);
// Max Links that /map can return // Max Links that /map can return
const MAX_MAP_LIMIT = 5000; const MAX_MAP_LIMIT = 5000;
@ -44,8 +44,12 @@ export async function mapController(
const sc: StoredCrawl = { const sc: StoredCrawl = {
originUrl: req.body.url, originUrl: req.body.url,
crawlerOptions: legacyCrawlerOptions(req.body), crawlerOptions: {
pageOptions: {}, ...req.body,
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
team_id: req.auth.team_id, team_id: req.auth.team_id,
createdAt: Date.now(), createdAt: Date.now(),
plan: req.auth.plan, plan: req.auth.plan,
@ -65,8 +69,8 @@ export async function mapController(
const cacheKey = `fireEngineMap:${mapUrl}`; const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null; const cachedResult = null;
let allResults: any[]; let allResults: any[] = [];
let pagePromises: Promise<any>[]; let pagePromises: Promise<any>[] = [];
if (cachedResult) { if (cachedResult) {
allResults = JSON.parse(cachedResult); allResults = JSON.parse(cachedResult);
@ -139,7 +143,7 @@ export async function mapController(
return null; return null;
} }
}) })
.filter((x) => x !== null); .filter((x) => x !== null) as string[];
// allows for subdomains to be included // allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url)); links = links.filter((x) => isSameDomain(x, req.body.url));
@ -153,7 +157,7 @@ export async function mapController(
links = removeDuplicateUrls(links); links = removeDuplicateUrls(links);
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => { billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
Logger.error( logger.error(
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}` `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
); );
// Optionally, you could notify an admin or add to a retry queue here // Optionally, you could notify an admin or add to a retry queue here
@ -175,9 +179,8 @@ export async function mapController(
mode: "map", mode: "map",
url: req.body.url, url: req.body.url,
crawlerOptions: {}, crawlerOptions: {},
pageOptions: {}, scrapeOptions: {},
origin: req.body.origin, origin: req.body.origin,
extractor_options: { mode: "markdown" },
num_tokens: 0, num_tokens: 0,
}); });

View File

@ -12,7 +12,7 @@ export async function scrapeStatusController(req: any, res: any) {
const job = await supabaseGetJobByIdOnlyData(req.params.jobId); const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){ if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
return res.status(403).json({ return res.status(403).json({
success: false, success: false,
error: "You are not allowed to access this resource.", error: "You are not allowed to access this resource.",

View File

@ -1,10 +1,7 @@
import { Request, Response } from "express"; import { Response } from "express";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { import {
Document, Document,
legacyDocumentConverter,
legacyExtractorOptions,
legacyScrapeOptions,
RequestWithAuth, RequestWithAuth,
ScrapeRequest, ScrapeRequest,
scrapeRequestSchema, scrapeRequestSchema,
@ -12,7 +9,6 @@ import {
} from "./types"; } from "./types";
import { billTeam } from "../../services/billing/credit_billing"; import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority"; import { getJobPriority } from "../../lib/job-priority";
@ -28,8 +24,6 @@ export async function scrapeController(
const origin = req.body.origin; const origin = req.body.origin;
const timeout = req.body.timeout; const timeout = req.body.timeout;
const pageOptions = legacyScrapeOptions(req.body);
const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
const jobId = uuidv4(); const jobId = uuidv4();
const startTime = new Date().getTime(); const startTime = new Date().getTime();
@ -43,11 +37,10 @@ export async function scrapeController(
{ {
url: req.body.url, url: req.body.url,
mode: "single_urls", mode: "single_urls",
crawlerOptions: {},
team_id: req.auth.team_id, team_id: req.auth.team_id,
plan: req.auth.plan, scrapeOptions: req.body,
pageOptions, internalOptions: {},
extractorOptions, plan: req.auth.plan!,
origin: req.body.origin, origin: req.body.origin,
is_scrape: true, is_scrape: true,
}, },
@ -56,13 +49,13 @@ export async function scrapeController(
jobPriority jobPriority
); );
const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0); const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
let doc: any | undefined; let doc: Document;
try { try {
doc = (await waitForJob(jobId, timeout + totalWait))[0]; doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
} catch (e) { } catch (e) {
Logger.error(`Error in scrapeController: ${e}`); logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && e.message.startsWith("Job wait")) { if (e instanceof Error && e.message.startsWith("Job wait")) {
return res.status(408).json({ return res.status(408).json({
success: false, success: false,
@ -71,34 +64,19 @@ export async function scrapeController(
} else { } else {
return res.status(500).json({ return res.status(500).json({
success: false, success: false,
error: `(Internal server error) - ${e && e?.message ? e.message : e} ${ error: `(Internal server error) - ${e && e?.message ? e.message : e}`,
extractorOptions && extractorOptions.mode !== "markdown"
? " - Could be due to LLM parsing issues"
: ""
}`,
}); });
} }
} }
await getScrapeQueue().remove(jobId); await getScrapeQueue().remove(jobId);
if (!doc) {
console.error("!!! PANIC DOC IS", doc);
return res.status(200).json({
success: true,
warning: "No page found",
data: doc,
});
}
delete doc.index;
delete doc.provider;
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;
const numTokens = const numTokens =
doc && doc.markdown doc && doc.extract
? numTokensFromString(doc.markdown, "gpt-3.5-turbo") // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
? 0 // TODO: fix
: 0; : 0;
let creditsToBeBilled = 1; // Assuming 1 credit per document let creditsToBeBilled = 1; // Assuming 1 credit per document
@ -111,22 +89,16 @@ export async function scrapeController(
} }
billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => { billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`); logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here // Optionally, you could notify an admin or add to a retry queue here
}); });
if (!pageOptions || !pageOptions.includeRawHtml) { if (!req.body.formats.includes("rawHtml")) {
if (doc && doc.rawHtml) { if (doc && doc.rawHtml) {
delete doc.rawHtml; delete doc.rawHtml;
} }
} }
if(pageOptions && pageOptions.includeExtract) {
if(!pageOptions.includeMarkdown && doc && doc.markdown) {
delete doc.markdown;
}
}
logJob({ logJob({
job_id: jobId, job_id: jobId,
success: true, success: true,
@ -137,16 +109,14 @@ export async function scrapeController(
team_id: req.auth.team_id, team_id: req.auth.team_id,
mode: "scrape", mode: "scrape",
url: req.body.url, url: req.body.url,
crawlerOptions: {}, scrapeOptions: req.body,
pageOptions: pageOptions,
origin: origin, origin: origin,
extractor_options: extractorOptions,
num_tokens: numTokens, num_tokens: numTokens,
}); });
return res.status(200).json({ return res.status(200).json({
success: true, success: true,
data: legacyDocumentConverter(doc), data: doc,
scrape_id: origin?.includes("website") ? jobId : undefined, scrape_id: origin?.includes("website") ? jobId : undefined,
}); });
} }

View File

@ -1,10 +1,11 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { z } from "zod"; import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { Action, ExtractorOptions, PageOptions } from "../../lib/entities";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { PlanType } from "../../types"; import { PlanType } from "../../types";
import { countries } from "../../lib/validate-country"; import { countries } from "../../lib/validate-country";
import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities";
import { InternalOptions } from "../../scraper/scrapeURL";
export type Format = export type Format =
| "markdown" | "markdown"
@ -167,6 +168,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({
}); });
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>; export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
export const batchScrapeRequestSchema = scrapeOptions.extend({ export const batchScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(), urls: url.array(),
@ -240,7 +242,7 @@ export const mapRequestSchema = crawlerOptions.extend({
includeSubdomains: z.boolean().default(true), includeSubdomains: z.boolean().default(true),
search: z.string().optional(), search: z.string().optional(),
ignoreSitemap: z.boolean().default(false), ignoreSitemap: z.boolean().default(false),
limit: z.number().min(1).max(5000).default(5000).optional(), limit: z.number().min(1).max(5000).default(5000),
}).strict(strictMessage); }).strict(strictMessage);
// export type MapRequest = { // export type MapRequest = {
@ -252,13 +254,14 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = { export type Document = {
markdown?: string; markdown?: string;
extract?: string; extract?: any;
html?: string; html?: string;
rawHtml?: string; rawHtml?: string;
links?: string[]; links?: string[];
screenshot?: string; screenshot?: string;
actions?: { actions?: {
screenshots: string[]; screenshots?: string[];
scrapes?: ScrapeActionContent[];
}; };
warning?: string; warning?: string;
metadata: { metadata: {
@ -291,11 +294,11 @@ export type Document = {
publishedTime?: string; publishedTime?: string;
articleTag?: string; articleTag?: string;
articleSection?: string; articleSection?: string;
url?: string;
sourceURL?: string; sourceURL?: string;
statusCode?: number; statusCode?: number;
error?: string; error?: string;
[key: string]: string | string[] | number | undefined; [key: string]: string | string[] | number | undefined;
}; };
}; };
@ -366,7 +369,7 @@ export type CrawlStatusResponse =
type AuthObject = { type AuthObject = {
team_id: string; team_id: string;
plan: PlanType; plan: PlanType | undefined;
}; };
type Account = { type Account = {
@ -439,7 +442,7 @@ export interface ResponseWithSentry<
sentry?: string, sentry?: string,
} }
export function legacyCrawlerOptions(x: CrawlerOptions) { export function toLegacyCrawlerOptions(x: CrawlerOptions) {
return { return {
includes: x.includePaths, includes: x.includePaths,
excludes: x.excludePaths, excludes: x.excludePaths,
@ -453,68 +456,90 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
}; };
} }
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
return { return {
includeMarkdown: x.formats.includes("markdown"), crawlOptions: crawlerOptions.parse({
includeHtml: x.formats.includes("html"), includePaths: x.includes,
includeRawHtml: x.formats.includes("rawHtml"), excludePaths: x.excludes,
includeExtract: x.formats.includes("extract"), limit: x.maxCrawledLinks ?? x.limit,
onlyIncludeTags: x.includeTags, maxDepth: x.maxDepth,
removeTags: x.excludeTags, allowBackwardLinks: x.allowBackwardCrawling,
onlyMainContent: x.onlyMainContent, allowExternalLinks: x.allowExternalContentLinks,
waitFor: x.waitFor, ignoreSitemap: x.ignoreSitemap,
headers: x.headers, // TODO: returnOnlyUrls support
includeLinks: x.formats.includes("links"), }),
screenshot: x.formats.includes("screenshot"), internalOptions: {
fullPageScreenshot: x.formats.includes("screenshot@fullPage"), v0CrawlOnlyUrls: x.returnOnlyUrls,
parsePDF: x.parsePDF,
actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.location ?? x.geolocation,
skipTlsVerification: x.skipTlsVerification,
removeBase64Images: x.removeBase64Images,
mobile: x.mobile,
};
}
export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
return {
mode: x.mode ? "llm-extraction" : "markdown",
extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.",
extractionSchema: x.schema,
userPrompt: x.prompt ?? "",
};
}
export function legacyDocumentConverter(doc: any): Document {
if (doc === null || doc === undefined) return null;
if (doc.metadata) {
if (doc.metadata.screenshot) {
doc.screenshot = doc.metadata.screenshot;
delete doc.metadata.screenshot;
}
if (doc.metadata.fullPageScreenshot) {
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
delete doc.metadata.fullPageScreenshot;
}
}
return {
markdown: doc.markdown,
links: doc.linksOnPage,
rawHtml: doc.rawHtml,
html: doc.html,
extract: doc.llm_extraction,
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
actions: doc.actions ?? undefined,
warning: doc.warning ?? undefined,
metadata: {
...doc.metadata,
pageError: undefined,
pageStatusCode: undefined,
error: doc.metadata?.pageError,
statusCode: doc.metadata?.pageStatusCode,
}, },
}; };
} }
/**
 * Translates legacy (v0) page/extractor options into the current scrape
 * options plus the internal options that accompany them.
 *
 * The assembled options object is run through `scrapeOptions.parse`, so the
 * returned `scrapeOptions` is whatever that schema produces, and `parse`
 * throws if it rejects the assembled object.
 *
 * @param pageOptions - Legacy page options to translate.
 * @param extractorOptions - Legacy extractor options. LLM extraction is only
 *   enabled when this is defined and its `mode` contains `"llm-extraction"`.
 * @param timeout - Forwarded unchanged as the `timeout` option.
 * @returns The parsed scrape options and the internal options
 *   (`atsv`, `v0DisableJsDom`, `v0UseFastMode`) copied from `pageOptions`.
 */
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
  // Legacy tag options may be a single string; wrap those in an array and
  // pass everything else (arrays, undefined) through untouched.
  const listify = <T>(value: T) => (typeof value === "string" ? [value] : value);

  // Built once and reused: its presence also decides whether the "extract"
  // format is requested.
  const llmExtract =
    extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")
      ? {
          systemPrompt: extractorOptions.extractionPrompt,
          prompt: extractorOptions.userPrompt,
          schema: extractorOptions.extractionSchema,
        }
      : undefined;

  // Format name paired with whether it is switched on. Order is significant:
  // it is the order the formats are handed to the schema.
  const formatToggles: [string, boolean][] = [
    ["markdown", pageOptions.includeMarkdown ?? true],
    ["html", pageOptions.includeHtml ?? false],
    ["rawHtml", pageOptions.includeRawHtml ?? false],
    ["screenshot", pageOptions.screenshot ?? false],
    ["screenshot@fullPage", pageOptions.fullPageScreenshot ?? false],
    ["extract", llmExtract !== undefined],
    ["links", true], // always requested
  ];
  const formats = formatToggles
    .filter(([, enabled]) => enabled)
    .map(([format]) => format);

  return {
    scrapeOptions: scrapeOptions.parse({
      formats,
      waitFor: pageOptions.waitFor,
      headers: pageOptions.headers,
      includeTags: listify(pageOptions.onlyIncludeTags),
      excludeTags: listify(pageOptions.removeTags),
      onlyMainContent: pageOptions.onlyMainContent ?? false,
      timeout: timeout,
      parsePDF: pageOptions.parsePDF,
      actions: pageOptions.actions,
      location: pageOptions.geolocation,
      skipTlsVerification: pageOptions.skipTlsVerification,
      removeBase64Images: pageOptions.removeBase64Images,
      extract: llmExtract,
      mobile: pageOptions.mobile,
    }),
    internalOptions: {
      atsv: pageOptions.atsv,
      v0DisableJsDom: pageOptions.disableJsDom,
      v0UseFastMode: pageOptions.useFastMode,
    },
    // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks
  };
}
/**
 * Converts a full set of legacy options (page, extractor, timeout and crawler)
 * in one call.
 *
 * The scrape options come from `fromLegacyScrapeOptions`; only the internal
 * options are taken from `fromLegacyCrawlerOptions`.
 *
 * @param pageOptions - Legacy page options.
 * @param extractorOptions - Legacy extractor options, if any.
 * @param timeout - Timeout forwarded to the scrape-option conversion.
 * @param crawlerOptions - Legacy crawler options.
 * @returns The converted scrape options and the merged internal options.
 */
export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
  const fromScrape = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
  const fromCrawl = fromLegacyCrawlerOptions(crawlerOptions);

  // Merges in place into the scrape-derived object; on a key collision the
  // crawler-derived value overwrites the scrape-derived one.
  const internalOptions = Object.assign(fromScrape.internalOptions, fromCrawl.internalOptions);

  return { scrapeOptions: fromScrape.scrapeOptions, internalOptions };
}
/**
 * Converts a current-format document into the legacy (v0) document shape.
 *
 * @param document - The document to convert.
 * @param internalOptions - When `v0CrawlOnlyUrls` is truthy, only the
 *   document's source URL is returned.
 * @returns `{ url }` in URL-only mode; otherwise a legacy document in which
 *   `links` becomes `linksOnPage`, `extract` becomes `llm_extraction`, the
 *   markdown is exposed as both `content` and `markdown`, and the metadata
 *   `error` / `statusCode` are moved to `pageError` / `pageStatusCode`.
 */
export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
  const { metadata } = document;

  if (internalOptions.v0CrawlOnlyUrls) {
    return { url: metadata.sourceURL! };
  }

  const { markdown, html, rawHtml, links, extract, screenshot, actions, warning } = document;

  return {
    content: markdown!,
    markdown: markdown!,
    html,
    rawHtml,
    linksOnPage: links,
    llm_extraction: extract,
    metadata: {
      ...metadata,
      // Blank the current-format keys, then re-expose their values under the
      // legacy names.
      error: undefined,
      statusCode: undefined,
      pageError: metadata.error,
      pageStatusCode: metadata.statusCode,
      // The legacy shape carries the screenshot inside metadata.
      screenshot,
    },
    actions,
    warning,
  };
}

View File

@ -1,19 +0,0 @@
import { WebScraperDataProvider } from "./scraper/WebScraper";
/**
 * Manual smoke run: configures a `WebScraperDataProvider` for a crawl of
 * https://mendable.ai, prints the source URL of every document it returns,
 * then prints how many documents there were.
 */
async function example() {
  const provider = new WebScraperDataProvider();
  await provider.setOptions({
    jobId: "TEST",
    mode: "crawl",
    urls: ["https://mendable.ai"],
    crawlerOptions: {},
  });

  const docs = await provider.getDocuments(false);
  docs.forEach((doc) => {
    console.log(doc.metadata.sourceURL);
  });
  console.log(docs.length);
}
// example();

View File

@ -6,28 +6,24 @@ import bodyParser from "body-parser";
import cors from "cors"; import cors from "cors";
import { getScrapeQueue } from "./services/queue-service"; import { getScrapeQueue } from "./services/queue-service";
import { v0Router } from "./routes/v0"; import { v0Router } from "./routes/v0";
import { initSDK } from "@hyperdx/node-opentelemetry";
import os from "os"; import os from "os";
import { Logger } from "./lib/logger"; import { logger } from "./lib/logger";
import { adminRouter } from "./routes/admin"; import { adminRouter } from "./routes/admin";
import { ScrapeEvents } from "./lib/scrape-events";
import http from 'node:http'; import http from 'node:http';
import https from 'node:https'; import https from 'node:https';
import CacheableLookup from 'cacheable-lookup'; import CacheableLookup from 'cacheable-lookup';
import { v1Router } from "./routes/v1"; import { v1Router } from "./routes/v1";
import expressWs from "express-ws"; import expressWs from "express-ws";
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
import { ZodError } from "zod"; import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
import dns from 'node:dns';
const { createBullBoard } = require("@bull-board/api"); const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { BullAdapter } = require("@bull-board/api/bullAdapter");
const { ExpressAdapter } = require("@bull-board/express"); const { ExpressAdapter } = require("@bull-board/express");
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
Logger.info(`Number of CPUs: ${numCPUs} available`); logger.info(`Number of CPUs: ${numCPUs} available`);
const cacheable = new CacheableLookup() const cacheable = new CacheableLookup()
@ -55,7 +51,6 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({
serverAdapter: serverAdapter, serverAdapter: serverAdapter,
}); });
app.use( app.use(
`/admin/${process.env.BULL_AUTH_KEY}/queues`, `/admin/${process.env.BULL_AUTH_KEY}/queues`,
serverAdapter.getRouter() serverAdapter.getRouter()
@ -78,15 +73,10 @@ app.use(adminRouter);
const DEFAULT_PORT = process.env.PORT ?? 3002; const DEFAULT_PORT = process.env.PORT ?? 3002;
const HOST = process.env.HOST ?? "localhost"; const HOST = process.env.HOST ?? "localhost";
// HyperDX OpenTelemetry
if (process.env.ENV === "production") {
initSDK({ consoleCapture: true, additionalInstrumentations: [] });
}
function startServer(port = DEFAULT_PORT) { function startServer(port = DEFAULT_PORT) {
const server = app.listen(Number(port), HOST, () => { const server = app.listen(Number(port), HOST, () => {
Logger.info(`Worker ${process.pid} listening on port ${port}`); logger.info(`Worker ${process.pid} listening on port ${port}`);
Logger.info( logger.info(
`For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`
); );
}); });
@ -103,7 +93,6 @@ app.get(`/serverHealthCheck`, async (req, res) => {
const [waitingJobs] = await Promise.all([ const [waitingJobs] = await Promise.all([
scrapeQueue.getWaitingCount(), scrapeQueue.getWaitingCount(),
]); ]);
const noWaitingJobs = waitingJobs === 0; const noWaitingJobs = waitingJobs === 0;
// 200 if no active jobs, 503 if there are active jobs // 200 if no active jobs, 503 if there are active jobs
return res.status(noWaitingJobs ? 200 : 500).json({ return res.status(noWaitingJobs ? 200 : 500).json({
@ -111,7 +100,7 @@ app.get(`/serverHealthCheck`, async (req, res) => {
}); });
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(error); logger.error(error);
return res.status(500).json({ error: error.message }); return res.status(500).json({ error: error.message });
} }
}); });
@ -140,7 +129,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
// Re-check the waiting jobs count after the timeout // Re-check the waiting jobs count after the timeout
waitingJobsCount = await getWaitingJobsCount(); waitingJobsCount = await getWaitingJobsCount();
if (waitingJobsCount >= treshold) { if (waitingJobsCount >= treshold) {
const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL!;
const message = { const message = {
text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${
timeout / 60000 timeout / 60000
@ -156,14 +145,14 @@ app.get("/serverHealthCheck/notify", async (req, res) => {
}); });
if (!response.ok) { if (!response.ok) {
Logger.error("Failed to send Slack notification"); logger.error("Failed to send Slack notification");
} }
} }
}, timeout); }, timeout);
} }
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.debug(error); logger.debug(error);
} }
}; };
@ -178,7 +167,7 @@ app.get("/is-production", (req, res) => {
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => { app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response<ErrorResponse>, next: NextFunction) => {
if (err instanceof ZodError) { if (err instanceof ZodError) {
if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) { if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) {
Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body)); logger.warn("Unsupported protocol error: " + JSON.stringify(req.body));
} }
res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
@ -206,11 +195,11 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
} }
} }
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
}); });
Logger.info(`Worker ${process.pid} started`); logger.info(`Worker ${process.pid} started`);
// const sq = getScrapeQueue(); // const sq = getScrapeQueue();

View File

@ -4,19 +4,19 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation
import { generateOpenAICompletions } from "./models"; import { generateOpenAICompletions } from "./models";
import { Document, ExtractorOptions } from "../entities"; import { Document, ExtractorOptions } from "../entities";
import { Logger } from "../logger"; import { logger } from "../logger";
// Generate completion using OpenAI // Generate completion using OpenAI
export async function generateCompletions( export async function generateCompletions(
documents: Document[], documents: Document[],
extractionOptions: ExtractorOptions, extractionOptions: ExtractorOptions | undefined,
mode: "markdown" | "raw-html" mode: "markdown" | "raw-html"
): Promise<Document[]> { ): Promise<Document[]> {
// const schema = zodToJsonSchema(options.schema) // const schema = zodToJsonSchema(options.schema)
const schema = extractionOptions.extractionSchema; const schema = extractionOptions?.extractionSchema;
const systemPrompt = extractionOptions.extractionPrompt; const systemPrompt = extractionOptions?.extractionPrompt;
const prompt = extractionOptions.userPrompt; const prompt = extractionOptions?.userPrompt;
const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider
@ -51,7 +51,7 @@ export async function generateCompletions(
return completionResult; return completionResult;
} catch (error) { } catch (error) {
Logger.error(`Error generating completions: ${error}`); logger.error(`Error generating completions: ${error}`);
throw error; throw error;
} }
default: default:

View File

@ -95,7 +95,7 @@ export async function generateOpenAICompletions({
try { try {
llmExtraction = JSON.parse( llmExtraction = JSON.parse(
jsonCompletion.choices[0].message.content.trim() (jsonCompletion.choices[0].message.content ?? "").trim()
); );
} catch (e) { } catch (e) {
throw new Error("Invalid JSON"); throw new Error("Invalid JSON");

View File

@ -3,7 +3,7 @@ export async function batchProcess<T>(
batchSize: number, batchSize: number,
asyncFunction: (item: T, index: number) => Promise<void> asyncFunction: (item: T, index: number) => Promise<void>
): Promise<void> { ): Promise<void> {
const batches = []; const batches: T[][] = [];
for (let i = 0; i < array.length; i += batchSize) { for (let i = 0; i < array.length; i += batchSize) {
const batch = array.slice(i, i + batchSize); const batch = array.slice(i, i + batchSize);
batches.push(batch); batches.push(batch);

View File

@ -1,13 +1,16 @@
import { InternalOptions } from "../scraper/scrapeURL";
import { ScrapeOptions } from "../controllers/v1/types";
import { WebCrawler } from "../scraper/WebScraper/crawler"; import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service"; import { redisConnection } from "../services/queue-service";
import { Logger } from "./logger"; import { logger } from "./logger";
export type StoredCrawl = { export type StoredCrawl = {
originUrl?: string; originUrl?: string;
crawlerOptions: any; crawlerOptions: any;
pageOptions: any; scrapeOptions: Omit<ScrapeOptions, "timeout">;
internalOptions: InternalOptions;
team_id: string; team_id: string;
plan: string; plan?: string;
robots?: string; robots?: string;
cancelled?: boolean; cancelled?: boolean;
createdAt: number; createdAt: number;
@ -100,7 +103,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
urlO.hash = ""; urlO.hash = "";
url = urlO.href; url = urlO.href;
} catch (error) { } catch (error) {
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
} }
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0 const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
@ -117,7 +120,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
urlO.hash = ""; urlO.hash = "";
return urlO.href; return urlO.href;
} catch (error) { } catch (error) {
Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
} }
return url; return url;
@ -131,7 +134,7 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
const crawler = new WebCrawler({ const crawler = new WebCrawler({
jobId: id, jobId: id,
initialUrl: sc.originUrl, initialUrl: sc.originUrl!,
includes: sc.crawlerOptions?.includes ?? [], includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [], excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,

View File

@ -1,3 +1,5 @@
import type { Document as V1Document } from "../controllers/v1/types";
export interface Progress { export interface Progress {
current: number; current: number;
total: number; total: number;
@ -129,7 +131,8 @@ export class Document {
provider?: string; provider?: string;
warning?: string; warning?: string;
actions?: { actions?: {
screenshots: string[]; screenshots?: string[];
scrapes?: ScrapeActionContent[];
} }
index?: number; index?: number;

View File

@ -5,7 +5,7 @@ import "../services/sentry"
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import dotenv from 'dotenv'; import dotenv from 'dotenv';
import { Logger } from './logger'; import { logger } from './logger';
dotenv.config(); dotenv.config();
// TODO: add a timeout to the Go parser // TODO: add a timeout to the Go parser
@ -40,7 +40,7 @@ class GoMarkdownConverter {
} }
} }
export async function parseMarkdown(html: string): Promise<string> { export async function parseMarkdown(html: string | null | undefined): Promise<string> {
if (!html) { if (!html) {
return ''; return '';
} }
@ -52,12 +52,12 @@ export async function parseMarkdown(html: string): Promise<string> {
markdownContent = processMultiLineLinks(markdownContent); markdownContent = processMultiLineLinks(markdownContent);
markdownContent = removeSkipToContentLinks(markdownContent); markdownContent = removeSkipToContentLinks(markdownContent);
Logger.info(`HTML to Markdown conversion using Go parser successful`); logger.info(`HTML to Markdown conversion using Go parser successful`);
return markdownContent; return markdownContent;
} }
} catch (error) { } catch (error) {
Sentry.captureException(error); Sentry.captureException(error);
Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`); logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
} }
// Fallback to TurndownService if Go parser fails or is not enabled // Fallback to TurndownService if Go parser fails or is not enabled

View File

@ -1,6 +1,6 @@
import { redisConnection } from "../../src/services/queue-service"; import { redisConnection } from "../../src/services/queue-service";
import { PlanType } from "../../src/types"; import { PlanType } from "../../src/types";
import { Logger } from "./logger"; import { logger } from "./logger";
const SET_KEY_PREFIX = "limit_team_id:"; const SET_KEY_PREFIX = "limit_team_id:";
export async function addJobPriority(team_id, job_id) { export async function addJobPriority(team_id, job_id) {
@ -13,7 +13,7 @@ export async function addJobPriority(team_id, job_id) {
// This approach will reset the expiration time to 60 seconds every time a new job is added to the set. // This approach will reset the expiration time to 60 seconds every time a new job is added to the set.
await redisConnection.expire(setKey, 60); await redisConnection.expire(setKey, 60);
} catch (e) { } catch (e) {
Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`); logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`);
} }
} }
@ -24,7 +24,7 @@ export async function deleteJobPriority(team_id, job_id) {
// remove job_id from the set // remove job_id from the set
await redisConnection.srem(setKey, job_id); await redisConnection.srem(setKey, job_id);
} catch (e) { } catch (e) {
Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`); logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`);
} }
} }
@ -33,7 +33,7 @@ export async function getJobPriority({
team_id, team_id,
basePriority = 10, basePriority = 10,
}: { }: {
plan: PlanType; plan: PlanType | undefined;
team_id: string; team_id: string;
basePriority?: number; basePriority?: number;
}): Promise<number> { }): Promise<number> {
@ -95,7 +95,7 @@ export async function getJobPriority({
); );
} }
} catch (e) { } catch (e) {
Logger.error( logger.error(
`Get job priority failed: ${team_id}, ${plan}, ${basePriority}` `Get job priority failed: ${team_id}, ${plan}, ${basePriority}`
); );
return basePriority; return basePriority;

View File

@ -1,42 +0,0 @@
// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url";
// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
// const scrapInBatches = async (
// urls: string[],
// batchSize: number,
// delayMs: number
// ) => {
// let successCount = 0;
// let errorCount = 0;
// for (let i = 0; i < urls.length; i += batchSize) {
// const batch = urls
// .slice(i, i + batchSize)
// .map((url) => scrapWithFireEngine(url));
// try {
// const results = await Promise.all(batch);
// results.forEach((data, index) => {
// if (data.trim() === "") {
// errorCount++;
// } else {
// successCount++;
// console.log(
// `Scraping result ${i + index + 1}:`,
// data.trim().substring(0, 20) + "..."
// );
// }
// });
// } catch (error) {
// console.error("Error during scraping:", error);
// }
// await delay(delayMs);
// }
// console.log(`Total successful scrapes: ${successCount}`);
// console.log(`Total errored scrapes: ${errorCount}`);
// };
// function run() {
// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com");
// scrapInBatches(urls, 10, 1000);
// }

View File

@ -1,57 +1,82 @@
import * as winston from "winston";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import Transport from "winston-transport";
configDotenv(); configDotenv();
enum LogLevel { const logFormat = winston.format.printf(info =>
NONE = 'NONE', // No logs will be output. `${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify(
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. info.metadata,
WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors. (_, value) => {
INFO = 'INFO', // For logging informational messages that highlight the progress of the application. if (value instanceof Error) {
DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging. return {
TRACE = 'TRACE' // For logging more detailed information than the DEBUG level. ...value,
} name: value.name,
export class Logger { message: value.message,
static colors = { stack: value.stack,
ERROR: '\x1b[31m%s\x1b[0m', // Red cause: value.cause,
WARN: '\x1b[33m%s\x1b[0m', // Yellow }
INFO: '\x1b[34m%s\x1b[0m', // Blue } else {
DEBUG: '\x1b[36m%s\x1b[0m', // Cyan return value;
TRACE: '\x1b[35m%s\x1b[0m' // Magenta }
};
static log (message: string, level: LogLevel) {
const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE;
const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE];
const currentLevelIndex = levels.indexOf(logLevel);
const messageLevelIndex = levels.indexOf(level);
if (currentLevelIndex >= messageLevelIndex) {
const color = Logger.colors[level];
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
// if (useDbAuthentication) {
// save to supabase? another place?
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
// }
} }
} ) : ""}`
static error(message: string | any) { )
Logger.log(message, LogLevel.ERROR);
export const logger = winston.createLogger({
level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug",
format: winston.format.json({
replacer(key, value) {
if (value instanceof Error) {
return {
...value,
name: value.name,
message: value.message,
stack: value.stack,
cause: value.cause,
}
} else {
return value;
}
}
}),
transports: [
new winston.transports.Console({
format: winston.format.combine(
winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }),
...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? [winston.format.colorize(), logFormat] : []),
),
}),
],
});
export type ArrayTransportOptions = Transport.TransportStreamOptions & {
array: any[];
scrapeId?: string;
};
export class ArrayTransport extends Transport {
private array: any[];
private scrapeId?: string;
constructor(opts: ArrayTransportOptions) {
super(opts);
this.array = opts.array;
this.scrapeId = opts.scrapeId;
} }
static warn(message: string) { log(info, next) {
Logger.log(message, LogLevel.WARN); setImmediate(() => {
} this.emit("logged", info);
});
static info(message: string) { if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) {
Logger.log(message, LogLevel.INFO); return next();
} }
static debug(message: string) { this.array.push(info);
Logger.log(message, LogLevel.DEBUG);
}
static trace(message: string) { next();
Logger.log(message, LogLevel.TRACE);
} }
} }

View File

@ -1,4 +1,4 @@
import { Logger } from "./logger"; import { logger } from "./logger";
export function performCosineSimilarity(links: string[], searchQuery: string) { export function performCosineSimilarity(links: string[], searchQuery: string) {
try { try {
@ -40,7 +40,7 @@ export function performCosineSimilarity(links: string[], searchQuery: string) {
links = a.map((item) => item.link); links = a.map((item) => item.link);
return links; return links;
} catch (error) { } catch (error) {
Logger.error(`Error performing cosine similarity: ${error}`); logger.error(`Error performing cosine similarity: ${error}`);
return links; return links;
} }
} }

View File

@ -1,8 +1,8 @@
import { Job } from "bullmq"; import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase"; import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger"; import { logger } from "./logger";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { Engine } from "../scraper/scrapeURL/engines";
configDotenv(); configDotenv();
export type ScrapeErrorEvent = { export type ScrapeErrorEvent = {
@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = {
type: "scrape", type: "scrape",
url: string, url: string,
worker?: string, worker?: string,
method: (typeof baseScrapers)[number], method: Engine,
result: null | { result: null | {
success: boolean, success: boolean,
response_code?: number, response_code?: number,
@ -49,7 +49,7 @@ export class ScrapeEvents {
}).select().single(); }).select().single();
return (result.data as any).id; return (result.data as any).id;
} catch (error) { } catch (error) {
// Logger.error(`Error inserting scrape event: ${error}`); // logger.error(`Error inserting scrape event: ${error}`);
return null; return null;
} }
} }
@ -69,7 +69,7 @@ export class ScrapeEvents {
} }
}).eq("id", logId); }).eq("id", logId);
} catch (error) { } catch (error) {
Logger.error(`Error updating scrape result: ${error}`); logger.error(`Error updating scrape result: ${error}`);
} }
} }
@ -81,7 +81,7 @@ export class ScrapeEvents {
worker: process.env.FLY_MACHINE_ID, worker: process.env.FLY_MACHINE_ID,
}); });
} catch (error) { } catch (error) {
Logger.error(`Error logging job event: ${error}`); logger.error(`Error logging job event: ${error}`);
} }
} }
} }

View File

@ -1,5 +1,5 @@
import { supabase_service } from "../services/supabase"; import { supabase_service } from "../services/supabase";
import { Logger } from "./logger"; import { logger } from "./logger";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
/** /**
@ -37,7 +37,7 @@ export const supabaseGetJobsById = async (jobIds: string[]) => {
.in("job_id", jobIds); .in("job_id", jobIds);
if (error) { if (error) {
Logger.error(`Error in supabaseGetJobsById: ${error}`); logger.error(`Error in supabaseGetJobsById: ${error}`);
Sentry.captureException(error); Sentry.captureException(error);
return []; return [];
} }
@ -61,7 +61,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
.eq("crawl_id", crawlId) .eq("crawl_id", crawlId)
if (error) { if (error) {
Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`); logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
Sentry.captureException(error); Sentry.captureException(error);
return []; return [];
} }

View File

@ -1,30 +1,25 @@
import { AuthResponse } from "../../src/types"; import { AuthResponse } from "../../src/types";
import { Logger } from "./logger"; import { logger } from "./logger";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
configDotenv(); configDotenv();
let warningCount = 0; let warningCount = 0;
export function withAuth<T extends AuthResponse, U extends any[]>( export function withAuth<T, U extends any[]>(
originalFunction: (...args: U) => Promise<T> originalFunction: (...args: U) => Promise<T>,
mockSuccess: T,
) { ) {
return async function (...args: U): Promise<T> { return async function (...args: U): Promise<T> {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (!useDbAuthentication) { if (!useDbAuthentication) {
if (warningCount < 5) { if (warningCount < 5) {
Logger.warn("You're bypassing authentication"); logger.warn("You're bypassing authentication");
warningCount++; warningCount++;
} }
return { success: true } as T; return { success: true } as T;
} else { } else {
try { return await originalFunction(...args);
return await originalFunction(...args);
} catch (error) {
Sentry.captureException(error);
Logger.error(`Error in withAuth function: ${error}`);
return { success: false, error: error.message } as T;
}
} }
}; };
} }

View File

@ -1,151 +1,127 @@
import { Job } from "bullmq"; import { Job } from "bullmq";
import { import {
CrawlResult,
WebScraperOptions, WebScraperOptions,
RunWebScraperParams, RunWebScraperParams,
RunWebScraperResult, RunWebScraperResult,
} from "../types"; } from "../types";
import { WebScraperDataProvider } from "../scraper/WebScraper";
import { DocumentUrl, Progress } from "../lib/entities";
import { billTeam } from "../services/billing/credit_billing"; import { billTeam } from "../services/billing/credit_billing";
import { Document } from "../lib/entities"; import { Document } from "../controllers/v1/types";
import { supabase_service } from "../services/supabase"; import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger"; import { logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events"; import { ScrapeEvents } from "../lib/scrape-events";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL";
import { Engine } from "../scraper/scrapeURL/engines";
configDotenv(); configDotenv();
export async function startWebScraperPipeline({ export async function startWebScraperPipeline({
job, job,
token, token,
}: { }: {
job: Job<WebScraperOptions>; job: Job<WebScraperOptions> & { id: string };
token: string; token: string;
}) { }) {
let partialDocs: Document[] = [];
return (await runWebScraper({ return (await runWebScraper({
url: job.data.url, url: job.data.url,
mode: job.data.mode, mode: job.data.mode,
crawlerOptions: job.data.crawlerOptions, scrapeOptions: {
extractorOptions: job.data.extractorOptions, ...job.data.scrapeOptions,
pageOptions: {
...job.data.pageOptions,
...(job.data.crawl_id ? ({ ...(job.data.crawl_id ? ({
includeRawHtml: true, formats: job.data.scrapeOptions.formats.concat(["rawHtml"]),
}): {}), }): {}),
}, },
inProgress: (progress) => { internalOptions: job.data.internalOptions,
Logger.debug(`🐂 Job in progress ${job.id}`); // onSuccess: (result, mode) => {
if (progress.currentDocument) { // logger.debug(`🐂 Job completed ${job.id}`);
partialDocs.push(progress.currentDocument); // saveJob(job, result, token, mode);
if (partialDocs.length > 50) { // },
partialDocs = partialDocs.slice(-50); // onError: (error) => {
} // logger.error(`🐂 Job failed ${job.id}`);
// job.updateProgress({ ...progress, partialDocs: partialDocs }); // ScrapeEvents.logJobEvent(job, "failed");
} // },
},
onSuccess: (result, mode) => {
Logger.debug(`🐂 Job completed ${job.id}`);
saveJob(job, result, token, mode);
},
onError: (error) => {
Logger.error(`🐂 Job failed ${job.id}`);
ScrapeEvents.logJobEvent(job, "failed");
job.moveToFailed(error, token, false);
},
team_id: job.data.team_id, team_id: job.data.team_id,
bull_job_id: job.id.toString(), bull_job_id: job.id.toString(),
priority: job.opts.priority, priority: job.opts.priority,
is_scrape: job.data.is_scrape ?? false, is_scrape: job.data.is_scrape ?? false,
})) as { success: boolean; message: string; docs: Document[] }; }));
} }
export async function runWebScraper({ export async function runWebScraper({
url, url,
mode, mode,
crawlerOptions, scrapeOptions,
pageOptions, internalOptions,
extractorOptions, // onSuccess,
inProgress, // onError,
onSuccess,
onError,
team_id, team_id,
bull_job_id, bull_job_id,
priority, priority,
is_scrape=false, is_scrape=false,
}: RunWebScraperParams): Promise<RunWebScraperResult> { }: RunWebScraperParams): Promise<ScrapeUrlResponse> {
let response: ScrapeUrlResponse | undefined = undefined;
let engines: EngineResultsTracker = {};
try { try {
const provider = new WebScraperDataProvider(); response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions });
if (mode === "crawl") { if (!response.success) {
await provider.setOptions({ if (response.error instanceof Error) {
jobId: bull_job_id, throw response.error;
mode: mode, } else {
urls: [url], throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error));
extractorOptions, }
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
bullJobId: bull_job_id,
priority,
});
} else {
await provider.setOptions({
jobId: bull_job_id,
mode: mode,
urls: url.split(","),
extractorOptions,
crawlerOptions: crawlerOptions,
pageOptions: pageOptions,
priority,
teamId: team_id
});
} }
const docs = (await provider.getDocuments(false, (progress: Progress) => {
inProgress(progress);
})) as Document[];
if (docs.length === 0) {
return {
success: true,
message: "No pages found",
docs: [],
};
}
// remove docs with empty content
const filteredDocs = crawlerOptions?.returnOnlyUrls
? docs.map((doc) => {
if (doc.metadata.sourceURL) {
return { url: doc.metadata.sourceURL };
}
})
: docs;
if(is_scrape === false) { if(is_scrape === false) {
let creditsToBeBilled = 1; // Assuming 1 credit per document let creditsToBeBilled = 1; // Assuming 1 credit per document
if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) { if (scrapeOptions.extract) {
creditsToBeBilled = 5; creditsToBeBilled = 5;
} }
billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => { billTeam(team_id, undefined, creditsToBeBilled).catch(error => {
Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`); logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
// Optionally, you could notify an admin or add to a retry queue here // Optionally, you could notify an admin or add to a retry queue here
}); });
} }
// This is where the returnvalue from the job is set // This is where the returnvalue from the job is set
onSuccess(filteredDocs, mode); // onSuccess(response.document, mode);
// this return doesn't matter too much for the job completion result engines = response.engines;
return { success: true, message: "", docs: filteredDocs }; return response;
} catch (error) { } catch (error) {
onError(error); engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {}));
return { success: false, message: error.message, docs: [] };
if (response !== undefined) {
return {
...response,
success: false,
error,
}
} else {
return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines };
}
// onError(error);
} finally {
const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[];
for (const engine of engineOrder) {
const result = engines[engine] as Exclude<EngineResultsTracker[Engine], undefined>;
ScrapeEvents.insert(bull_job_id, {
type: "scrape",
url,
method: engine,
result: {
success: result.state === "success",
response_code: (result.state === "success" ? result.result.statusCode : undefined),
response_size: (result.state === "success" ? result.result.html.length : undefined),
error: (result.state === "error" ? result.error : result.state === "timeout" ? "Timed out" : undefined),
time_taken: result.finishedAt - result.startedAt,
},
});
}
} }
} }
const saveJob = async (job: Job, result: any, token: string, mode: string) => { const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => {
try { try {
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
if (useDbAuthentication) { if (useDbAuthentication) {
@ -173,6 +149,6 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => {
} }
ScrapeEvents.logJobEvent(job, "completed"); ScrapeEvents.logJobEvent(job, "completed");
} catch (error) { } catch (error) {
Logger.error(`🐂 Failed to update job status: ${error}`); logger.error(`🐂 Failed to update job status: ${error}`);
} }
}; };

View File

@ -6,8 +6,8 @@ import {
cleanBefore24hCompleteJobsController, cleanBefore24hCompleteJobsController,
queuesController, queuesController,
} from "../controllers/v0/admin/queue"; } from "../controllers/v0/admin/queue";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
import { wrap } from "./v1"; import { wrap } from "./v1";
import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear";
export const adminRouter = express.Router(); export const adminRouter = express.Router();

View File

@ -14,7 +14,7 @@ import expressWs from "express-ws";
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws"; import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
import { crawlCancelController } from "../controllers/v1/crawl-cancel"; import { crawlCancelController } from "../controllers/v1/crawl-cancel";
import { Logger } from "../lib/logger"; import { logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape"; import { batchScrapeController } from "../controllers/v1/batch-scrape";
@ -32,10 +32,12 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
if (!minimum && req.body) { if (!minimum && req.body) {
minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1; minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1;
} }
const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum); const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1);
req.acuc = chunk; if (chunk) {
req.acuc = chunk;
}
if (!success) { if (!success) {
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
if (!res.headersSent) { if (!res.headersSent) {
return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." }); return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." });
} }
@ -50,20 +52,27 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => { return (req, res, next) => {
(async () => { (async () => {
const { success, team_id, error, status, plan, chunk } = await authenticateUser( const auth = await authenticateUser(
req, req,
res, res,
rateLimiterMode, rateLimiterMode,
); );
if (!success) { if (!auth.success) {
if (!res.headersSent) { if (!res.headersSent) {
return res.status(status).json({ success: false, error }); return res.status(auth.status).json({ success: false, error: auth.error });
} else {
return;
} }
} }
const { team_id, plan, chunk } = auth;
req.auth = { team_id, plan }; req.auth = { team_id, plan };
req.acuc = chunk; req.acuc = chunk ?? undefined;
if (chunk) {
req.account = { remainingCredits: chunk.remaining_credits };
}
next(); next();
})() })()
.catch(err => next(err)); .catch(err => next(err));

View File

@ -2,7 +2,6 @@
import { WebCrawler } from '../crawler'; import { WebCrawler } from '../crawler';
import axios from 'axios'; import axios from 'axios';
import robotsParser from 'robots-parser'; import robotsParser from 'robots-parser';
import { getAdjustedMaxDepth } from '../utils/maxDepthUtils';
jest.mock('axios'); jest.mock('axios');
jest.mock('robots-parser'); jest.mock('robots-parser');
@ -35,165 +34,6 @@ describe('WebCrawler', () => {
}); });
}); });
it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => {
  // Root-level start URL crawled with a requested max depth of 2.
  const initialUrl = 'http://example.com';
  const requestedDepth = 2;
  maxCrawledDepth = getAdjustedMaxDepth(initialUrl, requestedDepth);

  // Stubbed sitemap: one link per depth level, 0 through 3 (relative to the start URL).
  const sitemapLinks = ['', '/page1', '/page1/page2', '/page1/page2/page3'].map(
    (suffix) => initialUrl + suffix
  );

  crawler = new WebCrawler({
    jobId: "TEST",
    initialUrl,
    includes: [],
    excludes: [],
    limit: 100,
    maxCrawledDepth,
  });
  crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue(sitemapLinks);

  const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);

  // Depths 0..2 are kept, each with empty html.
  const keptLinks = sitemapLinks.slice(0, 3);
  expect(results).toEqual(keptLinks.map((url) => ({ url, html: '' })));

  // The depth-3 link must have been dropped.
  const tooDeep = sitemapLinks[3];
  expect(results.some(r => r.url === tooDeep)).toBe(false);
});
it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => {
  // Root-level start URL crawled with a requested max depth of 0.
  const initialUrl = 'http://example.com';
  const requestedDepth = 0;
  maxCrawledDepth = getAdjustedMaxDepth(initialUrl, requestedDepth);

  // Stubbed sitemap: one link per depth level, 0 through 3 (relative to the start URL).
  const sitemapLinks = ['', '/page1', '/page1/page2', '/page1/page2/page3'].map(
    (suffix) => initialUrl + suffix
  );

  crawler = new WebCrawler({
    jobId: "TEST",
    initialUrl,
    includes: [],
    excludes: [],
    limit: 100,
    maxCrawledDepth,
  });
  crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue(sitemapLinks);

  const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);

  // Only the start URL itself is expected back.
  expect(results).toEqual([{ url: sitemapLinks[0], html: '' }]);
});
it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => {
  // Start URL already one path segment deep; requested max depth of 1.
  const initialUrl = 'http://example.com/page1';
  const requestedDepth = 1;
  maxCrawledDepth = getAdjustedMaxDepth(initialUrl, requestedDepth);

  // Stubbed sitemap: one link per depth level, 0 through 3 (relative to the start URL).
  const sitemapLinks = ['', '/page2', '/page2/page3', '/page2/page3/page4'].map(
    (suffix) => initialUrl + suffix
  );

  crawler = new WebCrawler({
    jobId: "TEST",
    initialUrl,
    includes: [],
    excludes: [],
    limit: 100,
    maxCrawledDepth,
  });
  crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue(sitemapLinks);

  const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);

  // The start URL and its direct child are kept.
  const keptLinks = sitemapLinks.slice(0, 2);
  expect(results).toEqual(keptLinks.map((url) => ({ url, html: '' })));
});
// Title corrected: this case uses a requested max depth of 2 on a start URL that is
// one path segment deep (the previous title had the two numbers swapped).
it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 1 ', async () => {
  const initialUrl = 'http://example.com/page1'; // Start URL is one path segment deep
  const enteredMaxCrawledDepth = 2;
  maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth);

  crawler = new WebCrawler({
    jobId: "TEST",
    initialUrl: initialUrl,
    includes: [],
    excludes: [],
    limit: 100,
    maxCrawledDepth: maxCrawledDepth, // Adjusted max depth under test
  });

  // Mock sitemap fetching function to return controlled links.
  // Depths below are relative to the start URL.
  crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([
    initialUrl, // relative depth 0
    initialUrl + '/page2', // relative depth 1
    initialUrl + '/page2/page3', // relative depth 2
    initialUrl + '/page2/page3/page4' // relative depth 3, should be filtered out
  ]);

  const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth);

  expect(results).toEqual([
    { url: initialUrl, html: '' },
    { url: initialUrl + '/page2', html: '' },
    { url: initialUrl + '/page2/page3', html: '' }
  ]);
});
it('should handle allowBackwardCrawling option correctly', async () => {
  const initialUrl = 'https://mendable.ai/blog';
  const parentUrl = 'https://mendable.ai'; // sits above the start URL's path

  // Stubbed sitemap containing the start URL, its parent, and two descendants.
  const sitemapLinks = [
    initialUrl,
    parentUrl,
    initialUrl + '/page1',
    initialUrl + '/page1/page2',
  ];

  // Crawler local to this test, with backward crawling switched on.
  const backwardCrawler = new WebCrawler({
    jobId: "TEST",
    initialUrl,
    includes: [],
    excludes: [],
    limit: 100,
    maxCrawledDepth: 3,
    allowBackwardCrawling: true,
  });
  backwardCrawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue(sitemapLinks);

  const results = await backwardCrawler.start();

  // Every stubbed link, the parent included, is returned in order with empty html.
  expect(results).toEqual(sitemapLinks.map((url) => ({ url, html: '' })));

  // Explicit check that the backward link made it through.
  expect(results.some(r => r.url === parentUrl)).toBe(true);
});
it('should respect the limit parameter by not returning more links than specified', async () => { it('should respect the limit parameter by not returning more links than specified', async () => {
const initialUrl = 'http://example.com'; const initialUrl = 'http://example.com';
const limit = 2; // Set a limit for the number of links const limit = 2; // Set a limit for the number of links

View File

@ -1,37 +0,0 @@
import { scrapSingleUrl } from '../single_url';
import { PageOptions } from '../../../lib/entities';
// Replace `fetchHtmlContent` on the real '../single_url' module with a stub that
// resolves to a fixed HTML document; every other export is left as the real one.
jest.mock('../single_url', () => {
  const actualSingleUrl = jest.requireActual('../single_url');
  const cannedHtml = '<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>';
  actualSingleUrl.fetchHtmlContent = jest.fn().mockResolvedValue(cannedHtml);
  return actualSingleUrl;
});
describe('scrapSingleUrl', () => {
  it('should handle includeHtml option correctly', async () => {
    const url = 'https://roastmywebsite.ai';

    // Scrape the same URL with the includeHtml flag set as requested.
    const scrapeWith = (includeHtml: boolean) => {
      const pageOptions: PageOptions = { includeHtml };
      return scrapSingleUrl("TEST", url, pageOptions);
    };

    // Run sequentially: first with HTML requested, then without.
    const resultWithHtml = await scrapeWith(true);
    const resultWithoutHtml = await scrapeWith(false);

    expect(resultWithHtml.html).toBeDefined();
    expect(resultWithoutHtml.html).toBeUndefined();
  }, 10000);
});
// Verifies that scraping a page yields a non-empty `linksOnPage` array containing a
// known link. The title names the site that is actually scraped below.
it('should return a list of links on the flutterbricks.com page', async () => {
  const url = 'https://flutterbricks.com';
  const pageOptions: PageOptions = { includeHtml: true };

  const result = await scrapSingleUrl("TEST", url, pageOptions);

  // Check if the result contains a list of links
  expect(result.linksOnPage).toBeDefined();
  expect(Array.isArray(result.linksOnPage)).toBe(true);
  // Guarded access so a missing list fails the assertion instead of throwing a TypeError.
  expect(result.linksOnPage?.length ?? 0).toBeGreaterThan(0);
  expect(result.linksOnPage).toContain('https://flutterbricks.com/features');
}, 15000);

View File

@ -2,13 +2,10 @@ import axios, { AxiosError } from "axios";
import cheerio, { load } from "cheerio"; import cheerio, { load } from "cheerio";
import { URL } from "url"; import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap"; import { getLinksFromSitemap } from "./sitemap";
import async from "async";
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils"; import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout"; import { axiosTimeout } from "../../../src/lib/timeout";
import { Logger } from "../../../src/lib/logger"; import { logger } from "../../../src/lib/logger";
import https from "https"; import https from "https";
export class WebCrawler { export class WebCrawler {
private jobId: string; private jobId: string;
@ -73,7 +70,7 @@ export class WebCrawler {
try { try {
url = new URL(link.trim(), this.baseUrl); url = new URL(link.trim(), this.baseUrl);
} catch (error) { } catch (error) {
Logger.debug(`Error processing link: ${link} | Error: ${error.message}`); logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
return false; return false;
} }
const path = url.pathname; const path = url.pathname;
@ -132,7 +129,7 @@ export class WebCrawler {
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt // Check if the link is disallowed by robots.txt
if (!isAllowed) { if (!isAllowed) {
Logger.debug(`Link disallowed by robots.txt: ${link}`); logger.debug(`Link disallowed by robots.txt: ${link}`);
return false; return false;
} }
@ -161,7 +158,7 @@ export class WebCrawler {
} }
public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
@ -170,115 +167,6 @@ export class WebCrawler {
return null; return null;
} }
/**
 * Runs a crawl starting at `this.initialUrl` and returns the discovered pages.
 *
 * Steps: load robots.txt (best effort), return the sitemap result if one is found
 * (unless `crawlerOptions.ignoreSitemap` is set), otherwise crawl link-by-link and
 * re-apply the link filter to the crawl output.
 *
 * @param inProgress Optional progress callback forwarded to the crawl.
 * @param pageOptions Page options forwarded to the crawl.
 * @param crawlerOptions Only `ignoreSitemap` is read here.
 * @param concurrencyLimit Worker count forwarded to the crawl queue.
 * @param limit Maximum number of links passed to `filterLinks` for the crawl result.
 * @param maxDepth Accepted for backward compatibility; not read by this method
 *   (filtering uses `this.maxCrawledDepth`).
 * @returns `{ url, html }` pairs; `html` is `""` when no HTML was captured for a URL.
 */
public async start(
  inProgress?: (progress: Progress) => void,
  pageOptions?: PageOptions,
  crawlerOptions?: CrawlerOptions,
  concurrencyLimit: number = 5,
  limit: number = 10000,
  maxDepth: number = 10
): Promise<{ url: string, html: string }[]> {
  Logger.debug(`Crawler starting with ${this.initialUrl}`);

  // Fetch and parse robots.txt; a failure here is deliberately non-fatal.
  try {
    const txt = await this.getRobotsTxt();
    this.importRobotsTxt(txt);
    Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`);
  } catch (error) {
    Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}: ${error}`);
  }

  if (!crawlerOptions?.ignoreSitemap) {
    const sm = await this.tryGetSitemap();
    if (sm !== null) {
      return sm;
    }
  }

  const urls = await this.crawlUrls(
    [this.initialUrl],
    pageOptions,
    concurrencyLimit,
    inProgress
  );

  // Nothing crawled, but the start URL itself passes the filter: return it alone.
  if (
    urls.length === 0 &&
    this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
  ) {
    return [{ url: this.initialUrl, html: "" }];
  }

  // make sure to run include exclude here again
  const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);

  // Index HTML by URL once (first occurrence wins, matching a linear `find`)
  // instead of scanning `urls` for every filtered URL.
  const htmlByUrl = new Map<string, string>();
  for (const { url, html } of urls) {
    if (!htmlByUrl.has(url)) {
      htmlByUrl.set(url, html);
    }
  }
  return filteredUrls.map(url => ({ url, html: htmlByUrl.get(url) || "" }));
}
/**
 * Crawls `urls` through an `async.queue` with `concurrencyLimit` workers and
 * recurses into every batch of newly discovered links.
 *
 * Each worker records the pages returned by `this.crawl` in `this.crawledUrls`,
 * reports progress, then awaits a nested `crawlUrls` call (with its own queue)
 * for the new links before finishing.
 *
 * @param urls Candidate URLs; visited or robots-disallowed ones are dropped before queueing.
 * @param pageOptions Forwarded unchanged to `this.crawl`.
 * @param concurrencyLimit Concurrency passed to `async.queue` (also used by nested calls).
 * @param inProgress Optional callback invoked after each crawled task.
 * @returns Every entry currently in `this.crawledUrls` (not only those found by this call).
 */
private async crawlUrls(
urls: string[],
pageOptions: PageOptions,
concurrencyLimit: number,
inProgress?: (progress: Progress) => void,
): Promise<{ url: string, html: string }[]> {
const queue = async.queue(async (task: string, callback) => {
Logger.debug(`Crawling ${task}`);
// Stop doing work once the crawl cap (the smaller of maxCrawledLinks and limit) is reached.
if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) {
// NOTE(review): callback is guarded because it may be absent for an async worker — confirm against the `async` version in use.
if (callback && typeof callback === "function") {
callback();
}
return;
}
const newUrls = await this.crawl(task, pageOptions);
// add the initial url if not already added
// if (this.visited.size === 1) {
// let normalizedInitial = this.initialUrl;
// if (!normalizedInitial.endsWith("/")) {
// normalizedInitial = normalizedInitial + "/";
// }
// if (!newUrls.some(page => page.url === this.initialUrl)) {
// newUrls.push({ url: this.initialUrl, html: "" });
// }
// }
// Record every returned page; a later entry for the same URL overwrites the earlier html.
newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
// Progress reports the last newly found URL, or the task itself when nothing new was found.
if (inProgress && newUrls.length > 0) {
inProgress({
current: this.crawledUrls.size,
total: Math.min(this.maxCrawledLinks, this.limit),
status: "SCRAPING",
currentDocumentUrl: newUrls[newUrls.length - 1].url,
});
} else if (inProgress) {
inProgress({
current: this.crawledUrls.size,
total: Math.min(this.maxCrawledLinks, this.limit),
status: "SCRAPING",
currentDocumentUrl: task,
});
}
// Recurse: the nested call builds its own queue and this task waits for it to drain.
await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
if (callback && typeof callback === "function") {
callback();
}
}, concurrencyLimit);
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`);
// Only queue URLs that have not been visited and that robots.txt allows.
queue.push(
urls.filter(
(url) =>
!this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent")
),
(err) => {
if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`);
}
);
// NOTE(review): relies on drain() returning a promise when called without a handler — confirm for the installed `async` version.
await queue.drain();
Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`);
return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
}
public filterURL(href: string, url: string): string | null { public filterURL(href: string, url: string): string | null {
let fullUrl = href; let fullUrl = href;
if (!href.startsWith("http")) { if (!href.startsWith("http")) {
@ -346,79 +234,9 @@ export class WebCrawler {
return links; return links;
} }
/**
 * Fetches one page and returns the links found on it.
 *
 * The URL is marked visited first, then normalised (an `https://` prefix is added
 * when missing, one trailing slash is stripped). Files and social-media/email
 * links are skipped. The first page visited is fetched through `scrapSingleUrl`
 * and is itself included in the result; later pages are fetched with a plain
 * `axios.get`, and already-visited links are filtered out of their result.
 *
 * @param url Page to fetch.
 * @param pageOptions Options forwarded to `scrapSingleUrl` for the first page.
 * @returns One entry per link, each carrying the html and status of the page it was
 *   found on; `[]` when the URL is skipped or the fetch fails.
 */
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
  if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
    return [];
  }
  this.visited.add(url);

  if (!url.startsWith("http")) {
    url = "https://" + url;
  }
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }

  if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
    return [];
  }

  // Decide once, before any await, whether this is the first page of the crawl so
  // the fetch strategy and the result shaping below cannot disagree.
  const isFirstPage = this.visited.size === 1;

  try {
    let content: string;
    let pageStatusCode: number | undefined;
    let pageError: string | undefined;

    if (isFirstPage) {
      // If it is the first link, fetch with single url
      const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true });
      content = page.html ?? "";
      pageStatusCode = page.metadata?.pageStatusCode;
      pageError = page.metadata?.pageError || undefined;
    } else {
      const response = await axios.get(url, { timeout: axiosTimeout });
      content = response.data ?? "";
      pageStatusCode = response.status;
      pageError = response.statusText !== "OK" ? response.statusText : undefined;
    }

    const links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

    // Add the initial URL to the list of links
    if (isFirstPage) {
      links.push({ url, html: content, pageStatusCode, pageError });
    }

    links.push(
      ...this.extractLinksFromHTML(content, url).map((link) => ({
        url: link,
        html: content,
        pageStatusCode,
        pageError,
      }))
    );

    if (isFirstPage) {
      return links;
    }

    // Create a new list to return to avoid modifying the visited list
    return links.filter((link) => !this.visited.has(link.url));
  } catch (error) {
    // Best effort: a failed page contributes no links, but leave a trace of why.
    Logger.debug(`Failed to crawl ${url}: ${error}`);
    return [];
  }
}
private isRobotsAllowed(url: string): boolean { private isRobotsAllowed(url: string): boolean {
return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true) return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
} }
/**
 * Returns `url` with its query parameters sorted, so URLs that differ only in
 * parameter order compare equal. Input that cannot be parsed as a URL is
 * returned unchanged.
 */
private normalizeCrawlUrl(url: string): string {
  try {
    const parsed = new URL(url);
    const query = parsed.searchParams;
    query.sort(); // canonical parameter order
    return parsed.href;
  } catch (error) {
    return url;
  }
}
/**
 * Tells whether `url` passes the include filter: true when no include patterns
 * are configured (empty list, or first entry is the empty string), otherwise
 * true when at least one pattern, compiled as a RegExp, matches the URL.
 */
private matchesIncludes(url: string): boolean {
  const patterns = this.includes;
  const filterDisabled = patterns.length === 0 || patterns[0] == "";
  if (filterDisabled) {
    return true;
  }
  for (const pattern of patterns) {
    if (new RegExp(pattern).test(url)) {
      return true;
    }
  }
  return false;
}
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
return this.excludes.some((pattern) => { return this.excludes.some((pattern) => {
@ -503,7 +321,7 @@ export class WebCrawler {
const urlWithoutQuery = url.split('?')[0].toLowerCase(); const urlWithoutQuery = url.split('?')[0].toLowerCase();
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext)); return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
} catch (error) { } catch (error) {
Logger.error(`Error processing URL in isFile: ${error}`); logger.error(`Error processing URL in isFile: ${error}`);
return false; return false;
} }
} }
@ -524,7 +342,6 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext)); return socialMediaOrEmail.some((ext) => url.includes(ext));
} }
//
private async tryFetchSitemapLinks(url: string): Promise<string[]> { private async tryFetchSitemapLinks(url: string): Promise<string[]> {
const normalizeUrl = (url: string) => { const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
@ -546,7 +363,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
} }
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) { if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404 // ignore 404
} else { } else {
@ -565,7 +382,7 @@ export class WebCrawler {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
} }
} catch (error) { } catch (error) {
Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) { if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404 // ignore 404
} else { } else {

View File

@ -1,4 +1,4 @@
import { Logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
export async function handleCustomScraping( export async function handleCustomScraping(
text: string, text: string,
@ -6,7 +6,7 @@ export async function handleCustomScraping(
): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> { ): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> {
// Check for Readme Docs special case // Check for Readme Docs special case
if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) { if (text.includes('<meta name="readme-deploy"') && !url.includes('developers.notion.com')) {
Logger.debug( logger.debug(
`Special use case detected for ${url}, using Fire Engine with wait time 1000ms` `Special use case detected for ${url}, using Fire Engine with wait time 1000ms`
); );
return { return {
@ -21,7 +21,7 @@ export async function handleCustomScraping(
// Check for Vanta security portals // Check for Vanta security portals
if (text.includes('<link href="https://static.vanta.com')) { if (text.includes('<link href="https://static.vanta.com')) {
Logger.debug( logger.debug(
`Vanta link detected for ${url}, using Fire Engine with wait time 3000ms` `Vanta link detected for ${url}, using Fire Engine with wait time 3000ms`
); );
return { return {
@ -36,7 +36,7 @@ export async function handleCustomScraping(
const googleDriveMetaMatch = text.match(googleDriveMetaPattern); const googleDriveMetaMatch = text.match(googleDriveMetaPattern);
if (googleDriveMetaMatch) { if (googleDriveMetaMatch) {
const url = googleDriveMetaMatch[1]; const url = googleDriveMetaMatch[1];
Logger.debug(`Google Drive PDF link detected: ${url}`); logger.debug(`Google Drive PDF link detected: ${url}`);
const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/); const fileIdMatch = url.match(/https:\/\/drive\.google\.com\/file\/d\/([^\/]+)\/view/);
if (fileIdMatch) { if (fileIdMatch) {

View File

@ -1 +0,0 @@
// Shared timeout value exported for use by other modules.
// NOTE(review): presumably milliseconds (i.e. 15 seconds) — confirm against callers.
export const universalTimeout = 15000;

View File

@ -1,743 +0,0 @@
import {
Document,
ExtractorOptions,
PageOptions,
WebScraperOptions,
} from "../../lib/entities";
import { Progress } from "../../lib/entities";
import { scrapSingleUrl } from "./single_url";
import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import {
replaceImgPathsWithAbsolutePaths,
replacePathsWithAbsolutePaths,
} from "./utils/replacePaths";
import { generateCompletions } from "../../lib/LLM-extraction";
import { fetchAndProcessDocx } from "./utils/docxProcessor";
import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
export class WebScraperDataProvider {
private jobId: string;
private bullJobId: string;
private urls: string[] = [""];
private mode: "single_urls" | "sitemap" | "crawl" = "single_urls";
private includes: string | string[];
private excludes: string | string[];
private maxCrawledLinks: number;
private maxCrawledDepth: number = 10;
private returnOnlyUrls: boolean;
private limit: number = 10000;
private concurrentRequests: number = 20;
private generateImgAltText: boolean = false;
private ignoreSitemap: boolean = false;
private pageOptions?: PageOptions;
private extractorOptions?: ExtractorOptions;
private replaceAllPathsWithAbsolutePaths?: boolean = false;
private generateImgAltTextModel: "gpt-4-turbo" | "claude-3-opus" =
"gpt-4-turbo";
private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false;
private priority?: number;
private teamId?: string;
/**
 * Placeholder: authorization is not supported by this provider.
 * @throws Error always ("Method not implemented.").
 */
authorize(): void {
  const notImplemented = new Error("Method not implemented.");
  throw notImplemented;
}
/**
 * Placeholder: Nango authorization is not supported by this provider.
 * Although typed as returning a promise, the error is thrown synchronously.
 * @throws Error always ("Method not implemented.").
 */
authorizeNango(): Promise<void> {
  const notImplemented = new Error("Method not implemented.");
  throw notImplemented;
}
/**
 * Scrapes every URL into a Document, `this.concurrentRequests` URLs at a time.
 *
 * Batches run one after another; URLs inside a batch are scraped concurrently.
 * Results are stored at the position of their URL, so output order follows
 * input order regardless of completion order.
 *
 * @param urls URLs to scrape.
 * @param inProgress Optional callback invoked after each URL finishes.
 * @param allHtmls Optional pre-fetched HTML, index-aligned with `urls`; when absent
 *   an empty string is passed to the scraper.
 * @returns The scraped documents with any null slots removed.
 */
private async convertUrlsToDocuments(
  urls: string[],
  inProgress?: (progress: Progress) => void,
  allHtmls?: string[]
): Promise<Document[]> {
  const total = urls.length;
  const slots: (Document | null)[] = new Array(urls.length).fill(null);
  let finished = 0;

  // Scrape one URL and store its document at `position`.
  const scrapeInto = async (targetUrl: string, position: number): Promise<void> => {
    const knownHtml = allHtmls ? allHtmls[position] : "";
    const doc = await scrapSingleUrl(
      this.jobId,
      targetUrl,
      this.pageOptions,
      this.extractorOptions,
      knownHtml,
      this.priority,
      this.teamId,
    );
    finished++;
    if (inProgress) {
      inProgress({
        current: finished,
        total,
        status: "SCRAPING",
        currentDocumentUrl: targetUrl,
        currentDocument: { ...doc, index: finished },
      });
    }
    slots[position] = doc;
  };

  for (let offset = 0; offset < urls.length; offset += this.concurrentRequests) {
    const batch = urls.slice(offset, offset + this.concurrentRequests);
    const pending = batch.map((targetUrl, indexInBatch) =>
      scrapeInto(targetUrl, offset + indexInBatch)
    );
    await Promise.all(pending);
  }

  return slots.filter((slot) => slot !== null) as Document[];
}
/**
 * Entry point: validates the first configured URL, then produces documents
 * through the cached or uncached pipeline.
 *
 * @param useCaching When true the cache-aware pipeline is used; defaults to false.
 * @param inProgress Optional progress callback forwarded to the chosen pipeline.
 * @throws Whatever `validateInitialUrl` throws for an invalid first URL.
 */
async getDocuments(
  useCaching: boolean = false,
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  this.validateInitialUrl();
  return useCaching
    ? this.processDocumentsWithCache(inProgress)
    : this.processDocumentsWithoutCache(inProgress);
}
/**
 * Ensures a non-blank first URL is configured.
 *
 * A missing list, an empty list, or a non-string first entry is reported with the
 * same "Url is required" error as a blank string, rather than surfacing as a
 * TypeError from calling `.trim()` on `undefined`.
 *
 * @throws Error "Url is required" when no usable first URL is present.
 */
private validateInitialUrl(): void {
  const firstUrl: unknown = this.urls?.[0];
  if (typeof firstUrl !== "string" || firstUrl.trim() === "") {
    throw new Error("Url is required");
  }
}
/**
 * Produces documents without consulting the cache by dispatching on `this.mode`.
 * @param inProgress optional progress callback forwarded to the mode handler
 * @returns the handler's documents, or an empty list for an unrecognised mode
 */
private async processDocumentsWithoutCache(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const mode = this.mode;
  if (mode === "crawl") {
    return this.handleCrawlMode(inProgress);
  }
  if (mode === "single_urls") {
    return this.handleSingleUrlsMode(inProgress);
  }
  if (mode === "sitemap") {
    return this.handleSitemapMode(inProgress);
  }
  return [];
}
/**
 * Keeps only links on the same host as the first configured URL (ignoring a
 * leading `www.`) whose path starts with that URL's path.
 *
 * The protocol is not compared. Links that cannot be parsed as absolute URLs are
 * dropped (and logged) instead of aborting the whole filter.
 *
 * @param links Candidate links.
 * @returns The links that fall under the initial URL.
 * @throws TypeError if `links` is non-empty and the first configured URL is not a valid URL.
 */
private async cleanIrrelevantPath(links: string[]): Promise<string[]> {
  // Nothing to filter: return before parsing the initial URL, as a filter over an
  // empty list would never have parsed it either.
  if (links.length === 0) {
    return [];
  }

  // Parse and normalise the initial URL once rather than once per link.
  const initialUrl = new URL(this.urls[0]);
  // Normalize the hostname to account for www and non-www versions
  const initialHostname = initialUrl.hostname.replace(/^www\./, "");
  const initialPath = initialUrl.pathname;

  return links.filter((link) => {
    let parsedLink: URL;
    try {
      parsedLink = new URL(link);
    } catch (error) {
      Logger.debug(`Skipping unparsable link: ${link}`);
      return false;
    }
    const linkHostname = parsedLink.hostname.replace(/^www\./, "");
    // Hostname must match and the path must start with the initial URL's path
    return linkHostname === initialHostname && parsedLink.pathname.startsWith(initialPath);
  });
}
/**
 * Crawl mode: discovers links from the first configured URL with a `WebCrawler`,
 * then either returns URL-only documents (`returnOnlyUrls`) or scrapes each link
 * and passes the result to `cacheAndFinalizeDocuments`.
 *
 * @param inProgress Optional progress callback forwarded to the crawler and scraper.
 * @returns The resulting documents.
 */
private async handleCrawlMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  // Turns the include/exclude setting into a pattern list. An array whose first
  // entry is "" means "no patterns", a string is split on commas, and an unset
  // value becomes an empty list so the crawler never receives `undefined`.
  const toPatternList = (patterns: string | string[] | undefined): string[] => {
    if (Array.isArray(patterns)) {
      return patterns[0] !== "" ? patterns : [];
    }
    return typeof patterns === "string" ? patterns.split(',') : [];
  };

  const includes = toPatternList(this.includes);
  const excludes = toPatternList(this.excludes);

  const crawler = new WebCrawler({
    jobId: this.jobId,
    initialUrl: this.urls[0],
    includes,
    excludes,
    maxCrawledLinks: this.maxCrawledLinks,
    maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth),
    limit: this.limit,
    generateImgAltText: this.generateImgAltText,
    allowBackwardCrawling: this.allowBackwardCrawling,
    allowExternalContentLinks: this.allowExternalContentLinks,
  });

  const links = await crawler.start(
    inProgress,
    this.pageOptions,
    {
      ignoreSitemap: this.ignoreSitemap,
    },
    5,
    this.limit,
    this.maxCrawledDepth
  );

  const allLinks = links.map((e) => e.url);
  const allHtmls = links.map((e) => e.html);

  if (this.returnOnlyUrls) {
    return this.returnOnlyUrlsResponse(allLinks, inProgress);
  }

  // Hand the crawled HTML to the scraper only when fast mode is enabled and there
  // is html inside the links; otherwise each link is processed without it.
  const reuseCrawledHtml = this.crawlerMode === "fast" && links.some((link) => link.html);
  const documents: Document[] = await this.processLinks(
    allLinks,
    inProgress,
    reuseCrawledHtml ? allHtmls : undefined
  );

  return this.cacheAndFinalizeDocuments(documents, allLinks);
}
/**
 * Single-URL mode: scrapes exactly the configured URLs via `processLinks`.
 *
 * @param inProgress Optional progress callback forwarded to `processLinks`.
 * @returns The documents produced by `processLinks`.
 */
private async handleSingleUrlsMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const scraped = await this.processLinks(this.urls, inProgress);
  return scraped;
}
/**
 * Sitemap mode: reads links from the sitemap at the first configured URL, keeps
 * those accepted by `cleanIrrelevantPath`, then either returns URL-only documents
 * (`returnOnlyUrls`) or scrapes them and passes the result to
 * `cacheAndFinalizeDocuments`.
 *
 * @param inProgress Optional progress callback forwarded downstream.
 */
private async handleSitemapMode(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const sitemapUrl = this.urls[0];
  const sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
  const relevantLinks = await this.cleanIrrelevantPath(sitemapLinks);

  if (!this.returnOnlyUrls) {
    const scraped = await this.processLinks(relevantLinks, inProgress);
    return this.cacheAndFinalizeDocuments(scraped, relevantLinks);
  }

  return this.returnOnlyUrlsResponse(relevantLinks, inProgress);
}
/**
 * Builds the response used when only URLs were requested: reports completion
 * through the progress callback and returns one content-less Document per link.
 *
 * @param links      The discovered URLs.
 * @param inProgress Optional progress callback; called once with a COMPLETED status.
 * @returns One placeholder Document per link, with empty content/markdown and
 *          a pageStatusCode of 200. `html` is an empty string only when
 *          pageOptions.includeHtml is set, otherwise undefined.
 */
private async returnOnlyUrlsResponse(
  links: string[],
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const linkCount = links.length;
  inProgress?.({
    current: linkCount,
    total: linkCount,
    status: "COMPLETED",
    currentDocumentUrl: this.urls[0],
  });

  const placeholderHtml = this.pageOptions?.includeHtml ? "" : undefined;
  return links.map((url) => ({
    content: "",
    html: placeholderHtml,
    markdown: "",
    metadata: { sourceURL: url, pageStatusCode: 200 },
  }));
}
/**
 * Turns a list of links into Documents.
 *
 * PDF links (".pdf") and Word links (".doc" / ".docx") are fetched through
 * their dedicated helpers; every other link goes through
 * convertUrlsToDocuments. Page documents are then enriched with sitemap
 * metadata, optionally get their paths rewritten, have `html` stripped unless
 * requested, and (single_urls mode only) may be run through LLM extraction.
 *
 * @param links      URLs to process.
 * @param inProgress Optional progress callback forwarded to convertUrlsToDocuments.
 * @param allHtmls   Optional pre-fetched HTML, positionally paired with `links`
 *                   (allHtmls[i] belongs to links[i]).
 * @returns Page documents followed by PDF documents, then DOC/DOCX documents.
 */
private async processLinks(
  links: string[],
  inProgress?: (progress: Progress) => void,
  allHtmls?: string[]
): Promise<Document[]> {
  const isPdfLink = (link: string): boolean => link.endsWith(".pdf");
  const isDocLink = (link: string): boolean =>
    link.endsWith(".doc") || link.endsWith(".docx");

  const pdfLinks = links.filter(isPdfLink);
  const docLinks = links.filter(isDocLink);
  const [pdfDocuments, docxDocuments] = await Promise.all([
    this.fetchPdfDocuments(pdfLinks),
    this.fetchDocxDocuments(docLinks),
  ]);

  // Keep only regular page links. The pre-fetched HTML is filtered in the
  // same pass so that pageHtmls[i] still belongs to pageLinks[i]; dropping
  // entries from the links alone would shift every later HTML onto the
  // wrong URL.
  const pageLinks: string[] = [];
  const pageHtmls: string[] | undefined = allHtmls ? [] : undefined;
  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    if (isPdfLink(link) || isDocLink(link)) {
      continue;
    }
    pageLinks.push(link);
    if (allHtmls && pageHtmls) {
      pageHtmls.push(allHtmls[i]);
    }
  }

  let [documents, sitemapData] = await Promise.all([
    this.convertUrlsToDocuments(pageLinks, inProgress, pageHtmls),
    // In single_urls mode the sitemap lookup is best-effort: it is given a
    // 1500 timeout argument and any failure is logged and ignored.
    this.mode === "single_urls" && pageLinks.length > 0
      ? this.getSitemapDataForSingleUrl(this.urls[0], pageLinks[0], 1500).catch(
          (error) => {
            Logger.debug(`Failed to fetch sitemap data: ${error}`);
            return null;
          }
        )
      : Promise.resolve(null),
  ]);

  if (this.mode === "single_urls" && documents.length > 0) {
    documents[0].metadata.sitemap = sitemapData ?? undefined;
  } else {
    documents = await this.getSitemapData(this.urls[0], documents);
  }

  if (this.pageOptions.includeMarkdown) {
    documents = this.applyPathReplacements(documents);
  }

  if (!this.pageOptions.includeHtml) {
    for (const document of documents) {
      delete document.html;
    }
  }

  // documents = await this.applyImgAltText(documents);

  if (this.mode === "single_urls" && this.pageOptions.includeExtract) {
    const extractionMode = this.extractorOptions?.mode ?? "markdown";
    const completionMode =
      extractionMode === "llm-extraction-from-raw-html" ? "raw-html" : "markdown";
    if (
      extractionMode === "llm-extraction" ||
      extractionMode === "llm-extraction-from-markdown" ||
      extractionMode === "llm-extraction-from-raw-html"
    ) {
      documents = await generateCompletions(
        documents,
        this.extractorOptions,
        completionMode
      );
    }
  }

  return documents.concat(pdfDocuments).concat(docxDocuments);
}
/**
 * Fetches and converts every PDF link concurrently, recording one scrape
 * event per link (inserted before the fetch, updated with the outcome after).
 *
 * @param pdfLinks URLs of the PDFs to fetch.
 * @returns One Document per link, with the PDF text as both `content` and
 *          `markdown`.
 */
private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
  return Promise.all(
    pdfLinks.map(async (pdfLink) => {
      const startedAt = Date.now();

      // Started before the fetch and awaited afterwards, so the log insert
      // overlaps with the download.
      const pendingLogId = ScrapeEvents.insert(this.jobId, {
        type: "scrape",
        url: pdfLink,
        worker: process.env.FLY_MACHINE_ID,
        method: "pdf-scrape",
        result: null,
      });

      const pdfResult = await fetchAndProcessPdf(
        pdfLink,
        this.pageOptions.parsePDF
      );
      const { content, pageStatusCode, pageError } = pdfResult;

      const logId = await pendingLogId;
      const hasErrorStatus = pageStatusCode && pageStatusCode >= 400;
      // Success requires a non-error status and at least 100 characters of
      // trimmed content.
      ScrapeEvents.updateScrapeResult(logId, {
        response_size: content.length,
        success: !hasErrorStatus && !!content && (content.trim().length >= 100),
        error: pageError,
        response_code: pageStatusCode,
        time_taken: Date.now() - startedAt,
      });

      return {
        content: content,
        markdown: content,
        metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
        provider: "web-scraper",
      };
    })
  );
}
/**
 * Fetches and converts every DOC/DOCX link concurrently, recording one scrape
 * event per link (inserted before the fetch, updated with the outcome after).
 *
 * @param docxLinks URLs of the Word documents to fetch.
 * @returns One Document per link carrying the extracted text as `content`
 *          (no `markdown` field is set here).
 */
private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
  return Promise.all(
    docxLinks.map(async (docxLink) => {
      const startedAt = Date.now();

      // Started before the fetch and awaited afterwards, so the log insert
      // overlaps with the download.
      const pendingLogId = ScrapeEvents.insert(this.jobId, {
        type: "scrape",
        url: docxLink,
        worker: process.env.FLY_MACHINE_ID,
        method: "docx-scrape",
        result: null,
      });

      const docxResult = await fetchAndProcessDocx(docxLink);
      const { content, pageStatusCode, pageError } = docxResult;

      const logId = await pendingLogId;
      const hasErrorStatus = pageStatusCode && pageStatusCode >= 400;
      // Success requires a non-error status and at least 100 characters of
      // trimmed content.
      ScrapeEvents.updateScrapeResult(logId, {
        response_size: content.length,
        success: !hasErrorStatus && !!content && (content.trim().length >= 100),
        error: pageError,
        response_code: pageStatusCode,
        time_taken: Date.now() - startedAt,
      });

      return {
        content,
        metadata: { sourceURL: docxLink, pageStatusCode, pageError },
        provider: "web-scraper",
      };
    })
  );
}
/**
 * Rewrites paths in the documents: image paths always go through
 * replaceImgPathsWithAbsolutePaths, and all paths first go through
 * replacePathsWithAbsolutePaths when replaceAllPathsWithAbsolutePaths is set.
 *
 * @param documents Documents to rewrite.
 * @returns The documents returned by the replacement helpers.
 */
private applyPathReplacements(documents: Document[]): Document[] {
  const withAbsolutePaths = this.replaceAllPathsWithAbsolutePaths
    ? replacePathsWithAbsolutePaths(documents)
    : documents;
  return replaceImgPathsWithAbsolutePaths(withAbsolutePaths);
}
/**
 * Runs image alt-text generation over the documents when the
 * generateImgAltText option is enabled; otherwise returns them untouched.
 *
 * @param documents Documents to process.
 * @returns The (possibly updated) documents.
 */
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
  if (!this.generateImgAltText) {
    return documents;
  }
  return this.generatesImgAltText(documents);
}
/**
 * Final step for crawl/sitemap results: strips `childrenLinks` from every
 * document and caps the result at `this.limit` entries. Caching is currently
 * disabled (see the commented-out call).
 *
 * @param documents Documents to finalize. The array is mutated: the returned
 *                  entries are removed from it by `splice`.
 * @param links     Links that would be cached as children; unused while
 *                  caching is disabled.
 * @returns At most `this.limit` documents, taken from the front.
 */
private async cacheAndFinalizeDocuments(
  documents: Document[],
  links: string[]
): Promise<Document[]> {
  // await this.setCachedDocuments(documents, links);
  const withoutChildLinks = this.removeChildLinks(documents);
  return withoutChildLinks.splice(0, this.limit);
}
/**
 * Serves documents from the cache, topping up with a fresh getDocuments run
 * when the cache holds fewer than `this.limit` entries, then applies the
 * include/exclude filter, the depth filter, child-link removal and the limit.
 *
 * @param inProgress Optional progress callback forwarded to getDocuments.
 * @returns At most `this.limit` filtered documents.
 */
private async processDocumentsWithCache(
  inProgress?: (progress: Progress) => void
): Promise<Document[]> {
  const requestedUrls = this.urls.slice(0, this.limit);
  let documents = await this.getCachedDocuments(requestedUrls);

  const cacheIsShort = documents.length < this.limit;
  if (cacheIsShort) {
    const freshDocuments: Document[] = await this.getDocuments(
      false,
      inProgress
    );
    documents = this.mergeNewDocuments(documents, freshDocuments);
  }

  const filtered = this.removeChildLinks(
    this.filterDepth(this.filterDocsExcludeInclude(documents))
  );
  return filtered.splice(0, this.limit);
}
/**
 * Appends to `existingDocuments` every new document whose normalized source
 * URL is not already present in it.
 *
 * @param existingDocuments Target list; mutated in place and returned.
 * @param newDocuments      Candidates to append.
 * @returns `existingDocuments`, including the appended entries.
 */
private mergeNewDocuments(
  existingDocuments: Document[],
  newDocuments: Document[]
): Document[] {
  for (const candidate of newDocuments) {
    let alreadyKnown = false;
    // Entries appended earlier in this loop are part of the comparison, so
    // duplicates among the new documents are dropped as well.
    for (const known of existingDocuments) {
      if (
        this.normalizeUrl(known.metadata.sourceURL) ===
        this.normalizeUrl(candidate.metadata?.sourceURL)
      ) {
        alreadyKnown = true;
        break;
      }
    }
    if (!alreadyKnown) {
      existingDocuments.push(candidate);
    }
  }
  return existingDocuments;
}
/**
 * Keeps only the documents whose URL path passes the exclude and include
 * pattern lists.
 *
 * A document is dropped when its path matches any exclude pattern. When
 * include patterns are active, a document is kept only if its path matches at
 * least one of them. A list is considered inactive when it is empty or its
 * first entry is the empty string.
 *
 * Side effect: `this.excludes` / `this.includes` are converted from a
 * comma-separated string to an array when there is at least one document.
 *
 * @param documents Documents to filter; `metadata.sourceURL` must be a valid
 *                  absolute URL (the URL constructor throws otherwise).
 * @returns A new array with the documents that pass both lists.
 */
private filterDocsExcludeInclude(documents: Document[]): Document[] {
  if (documents.length === 0) {
    return [];
  }

  // Normalise the pattern lists once instead of once per document.
  if (!Array.isArray(this.excludes)) {
    this.excludes = this.excludes.split(',');
  }
  if (!Array.isArray(this.includes)) {
    this.includes = this.includes.split(',');
  }

  // Compile each pattern a single time; the expressions are loop-invariant.
  const excludePatterns: RegExp[] =
    this.excludes.length > 0 && this.excludes[0] !== ""
      ? this.excludes.map((pattern) => new RegExp(pattern))
      : [];
  const includePatterns: RegExp[] =
    this.includes.length > 0 && this.includes[0] !== ""
      ? this.includes.map((pattern) => new RegExp(pattern))
      : [];

  return documents.filter((document) => {
    const path = new URL(document.metadata.sourceURL).pathname;

    if (excludePatterns.some((pattern) => pattern.test(path))) {
      return false;
    }
    if (includePatterns.length > 0) {
      return includePatterns.some((pattern) => pattern.test(path));
    }
    return true;
  });
}
/**
 * Normalizes a URL for comparison by removing the first "www." that directly
 * follows "//".
 *
 * @param url The URL to normalize.
 * @returns The URL without its leading "www." host prefix, or the input
 *          unchanged when there is none.
 */
private normalizeUrl(url: string): string {
  return url.includes("//www.") ? url.replace("//www.", "//") : url;
}
/**
 * Deletes the `childrenLinks` property from every document that has a truthy
 * one.
 *
 * @param documents Documents to clean; mutated in place.
 * @returns The same array.
 */
private removeChildLinks(documents: Document[]): Document[] {
  documents.forEach((entry) => {
    if (entry?.childrenLinks) {
      delete entry.childrenLinks;
    }
  });
  return documents;
}
/**
 * Stores each non-empty document under "web-scraper-cache:<normalized URL>",
 * serialized as JSON together with the given children links. Documents whose
 * trimmed content is empty are skipped. Writes happen one after another.
 *
 * @param documents     Documents to cache.
 * @param childrenLinks Links stored alongside every document (defaults to []).
 */
async setCachedDocuments(documents: Document[], childrenLinks?: string[]) {
  const linkedChildren = childrenLinks || [];
  // NOTE(review): 60 * 60 = 3600, i.e. one hour if setValue takes seconds.
  // This spot was previously annotated "10 days", which does not match the
  // value — confirm the intended expiry.
  const expiry = 60 * 60;

  for (const document of documents) {
    const isEmpty = document.content.trim().length === 0;
    if (!isEmpty) {
      const cacheKey =
        "web-scraper-cache:" + this.normalizeUrl(document.metadata.sourceURL);
      const serialized = JSON.stringify({
        ...document,
        childrenLinks: linkedChildren,
      });
      await setValue(cacheKey, serialized, expiry);
    }
  }
}
/**
 * Loads cached documents for the given URLs. For every cache hit, the
 * documents referenced by its `childrenLinks` are looked up as well and added
 * when no document with the same `metadata.sourceURL` has been collected yet.
 *
 * @param urls URLs to look up (normalized before building the cache key).
 * @returns The cached documents found, parents first, each followed by its
 *          newly seen children.
 */
async getCachedDocuments(urls: string[]): Promise<Document[]> {
  const found: Document[] = [];

  for (const url of urls) {
    const normalizedUrl = this.normalizeUrl(url);
    Logger.debug(
      "Getting cached document for web-scraper-cache:" + normalizedUrl
    );

    const rawParent = await getValue("web-scraper-cache:" + normalizedUrl);
    if (!rawParent) {
      continue;
    }

    const parent = JSON.parse(rawParent);
    found.push(parent);

    // Pull in the children recorded with the parent entry.
    for (const childUrl of parent.childrenLinks || []) {
      const rawChild = await getValue(
        "web-scraper-cache:" + this.normalizeUrl(childUrl)
      );
      if (!rawChild) {
        continue;
      }

      const child = JSON.parse(rawChild);
      const isDuplicate = found.some(
        (doc) => doc.metadata.sourceURL === child.metadata.sourceURL
      );
      if (!isDuplicate) {
        found.push(child);
      }
    }
  }

  return found;
}
/**
 * Copies the caller's options onto this instance, filling in defaults for
 * everything that was left out.
 *
 * Notable normalisation:
 * - `includes` / `excludes` given as a comma-separated string are split into
 *   a list with blank entries removed;
 * - every URL that does not start with an explicit "http://" or "https://"
 *   scheme is trimmed and prefixed with "https://".
 *
 * @param options Scraper options; `urls` is mandatory.
 * @throws Error when `options.urls` is missing.
 */
setOptions(options: WebScraperOptions): void {
  if (!options.urls) {
    throw new Error("Urls are required");
  }

  this.jobId = options.jobId;
  this.bullJobId = options.bullJobId;
  this.urls = options.urls;
  this.mode = options.mode;
  this.concurrentRequests = options.concurrentRequests ?? 20;
  this.includes = options.crawlerOptions?.includes ?? [];
  this.excludes = options.crawlerOptions?.excludes ?? [];
  this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000;
  this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10;
  this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false;
  this.limit = options.crawlerOptions?.limit ?? 10000;
  this.generateImgAltText =
    options.crawlerOptions?.generateImgAltText ?? false;

  this.pageOptions = {
    onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
    includeHtml: options.pageOptions?.includeHtml ?? false,
    replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
    parsePDF: options.pageOptions?.parsePDF ?? true,
    onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
    removeTags: options.pageOptions?.removeTags ?? [],
    includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
    includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
    // Extraction defaults to "on" whenever an extractor mode other than
    // "markdown" was requested.
    includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false,
    waitFor: options.pageOptions?.waitFor ?? undefined,
    headers: options.pageOptions?.headers ?? undefined,
    includeLinks: options.pageOptions?.includeLinks ?? true,
    fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
    screenshot: options.pageOptions?.screenshot ?? false,
    useFastMode: options.pageOptions?.useFastMode ?? false,
    disableJsDom: options.pageOptions?.disableJsDom ?? false,
    atsv: options.pageOptions?.atsv ?? false,
    actions: options.pageOptions?.actions ?? undefined,
    geolocation: options.pageOptions?.geolocation ?? undefined,
    skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
    removeBase64Images: options.pageOptions?.removeBase64Images ?? true,
    mobile: options.pageOptions?.mobile ?? false,
  };

  this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };

  // The crawler-level flag wins over the page-level one.
  this.replaceAllPathsWithAbsolutePaths =
    options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ??
    options.pageOptions?.replaceAllPathsWithAbsolutePaths ??
    false;

  if (typeof options.crawlerOptions?.excludes === 'string') {
    this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
  }
  if (typeof options.crawlerOptions?.includes === 'string') {
    this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
  }

  this.crawlerMode = options.crawlerOptions?.mode ?? "default";
  this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
  this.allowBackwardCrawling =
    options.crawlerOptions?.allowBackwardCrawling ?? false;
  this.allowExternalContentLinks =
    options.crawlerOptions?.allowExternalContentLinks ?? false;
  this.priority = options.priority;
  this.teamId = options.teamId ?? null;

  // Make sure every URL carries an http(s) scheme. The check looks for the
  // full "http://" / "https://" prefix: a bare `startsWith("http")` would
  // wrongly accept scheme-less hosts such as "httpbin.org". The prefixed
  // form is built from the trimmed value so no whitespace ends up after
  // the scheme.
  const httpScheme = /^https?:\/\//i;
  this.urls = this.urls.map((url) => {
    const trimmed = url.trim();
    return httpScheme.test(trimmed) ? url : `https://${trimmed}`;
  });
}
/**
 * Attaches sitemap metadata (changefreq, priority, lastmod) to every document
 * whose normalized source URL appears in the sitemap fetched for `baseUrl`.
 *
 * @param baseUrl   URL handed to fetchSitemapData.
 * @param documents Documents to enrich; mutated in place.
 * @returns The same documents array.
 */
private async getSitemapData(baseUrl: string, documents: Document[]) {
  const sitemapEntries = await fetchSitemapData(baseUrl);
  if (!sitemapEntries) {
    return documents;
  }

  for (const document of documents) {
    const entry = sitemapEntries.find(
      (data) =>
        this.normalizeUrl(data.loc) ===
        this.normalizeUrl(document.metadata.sourceURL)
    );
    if (!entry) {
      continue;
    }

    // Copy only the fields that are actually present (truthy) on the entry.
    const sitemapDocData: Partial<SitemapEntry> = {};
    if (entry.changefreq) {
      sitemapDocData.changefreq = entry.changefreq;
    }
    if (entry.priority) {
      sitemapDocData.priority = Number(entry.priority);
    }
    if (entry.lastmod) {
      sitemapDocData.lastmod = entry.lastmod;
    }

    const hasAnyField = Object.keys(sitemapDocData).length !== 0;
    if (hasAnyField) {
      document.metadata.sitemap = sitemapDocData;
    }
  }

  return documents;
}
/**
 * Looks up the sitemap entry for a single URL and returns its metadata.
 *
 * @param baseUrl URL handed to fetchSitemapData.
 * @param url     The page URL to find in the sitemap (compared after normalization).
 * @param timeout Optional timeout forwarded to fetchSitemapData.
 * @returns An object holding whichever of changefreq / priority / lastmod the
 *          entry provides, or null when there is no sitemap, no matching
 *          entry, or none of those fields.
 */
private async getSitemapDataForSingleUrl(
  baseUrl: string,
  url: string,
  timeout?: number
) {
  const sitemapEntries = await fetchSitemapData(baseUrl, timeout);
  if (!sitemapEntries) {
    return null;
  }

  const entry = sitemapEntries.find(
    (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url)
  );
  if (!entry) {
    return null;
  }

  // Copy only the fields that are actually present (truthy) on the entry.
  const sitemapDocData: Partial<SitemapEntry> = {};
  if (entry.changefreq) {
    sitemapDocData.changefreq = entry.changefreq;
  }
  if (entry.priority) {
    sitemapDocData.priority = Number(entry.priority);
  }
  if (entry.lastmod) {
    sitemapDocData.lastmod = entry.lastmod;
  }

  return Object.keys(sitemapDocData).length !== 0 ? sitemapDocData : null;
}
/**
 * Fills in missing alt text for markdown images in each document's content.
 *
 * An image is processed only when its alt text is empty, its URL is not a
 * data URI, and the URL ends in .png, .jpeg, .gif or .webp. The description
 * is requested from getImageDescription together with up to 1000 characters
 * of text on either side of the image.
 *
 * @param documents Documents to update; `content` is rewritten in place.
 * @returns The same documents array.
 */
generatesImgAltText = async (documents: Document[]): Promise<Document[]> => {
  await Promise.all(
    documents.map(async (document) => {
      const images: string[] =
        document.content.match(/!\[.*?\]\((.*?)\)/g) || [];

      await Promise.all(
        images.map(async (image: string) => {
          // Split the image into alt text and URL with one anchored match.
          // Matching the two parts independently would throw on an empty
          // URL ("![alt]()") and would mistake a parenthetical inside the
          // alt text for the URL.
          const parts = /^!\[(.*?)\]\((.*?)\)$/.exec(image);
          if (!parts) {
            return;
          }
          const existingAlt = parts[1];
          const imageUrl = parts[2];

          const needsAltText =
            !existingAlt &&
            !imageUrl.startsWith("data:image") &&
            /\.(png|jpeg|gif|webp)$/.test(imageUrl);
          if (!needsAltText) {
            // Nothing to change; the image is left exactly as written.
            return;
          }

          const imageIndex = document.content.indexOf(image);
          const contentLength = document.content.length;
          const backText = document.content.substring(
            imageIndex + image.length,
            Math.min(imageIndex + image.length + 1000, contentLength)
          );
          const frontTextStartIndex = Math.max(imageIndex - 1000, 0);
          const frontText = document.content.substring(
            frontTextStartIndex,
            imageIndex
          );

          const altText = await getImageDescription(
            imageUrl,
            backText,
            frontText,
            this.generateImgAltTextModel
          );

          // A replacer function is used so that "$" sequences in the alt
          // text or URL are inserted literally instead of being read as
          // replacement patterns.
          document.content = document.content.replace(
            image,
            () => `![${altText}](${imageUrl})`
          );
        })
      );
    })
  );
  return documents;
};
/**
 * Drops documents whose URL depth (as computed by getURLDepth) exceeds
 * `this.maxCrawledDepth`.
 *
 * @param documents Documents to filter; `metadata.sourceURL` must be a valid
 *                  absolute URL (the URL constructor throws otherwise).
 * @returns A new array with the documents within the depth limit.
 */
filterDepth(documents: Document[]): Document[] {
  return documents.filter((entry) => {
    const serializedUrl = new URL(entry.metadata.sourceURL).toString();
    const depth = getURLDepth(serializedUrl);
    return depth <= this.maxCrawledDepth;
  });
}
}

View File

@ -1,89 +0,0 @@
import axios from "axios";
import { logScrape } from "../../../services/logging/scrape_log";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/**
 * Scrapes a URL with a plain axios GET request.
 *
 * PDF responses (by content type) are delegated to fetchAndProcessPdf; any
 * other body is returned as raw text. Failures never throw: they are reported
 * through `pageError` with empty content. Every attempt is recorded with
 * logScrape, including its duration.
 *
 * @param url The URL to scrape
 * @param pageOptions The options for the page (`parsePDF` is forwarded to the PDF processor)
 * @returns The scraped content with the page status code and error, if any
 */
export async function scrapWithFetch(
  url: string,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: "fetch",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const response = await axios.get(url, {
      headers: {
        "Content-Type": "application/json",
      },
      timeout: universalTimeout,
      // Keep the body as raw text; axios would otherwise parse JSON.
      transformResponse: [(data) => data],
    });
    const status = response.status;

    if (status !== 200) {
      Logger.debug(
        `⛏️ Axios: Failed to fetch url: ${url} with status: ${status}`
      );
      logParams.error_message = response.statusText;
      logParams.response_code = status;
      return {
        content: "",
        pageStatusCode: status,
        pageError: response.statusText,
      };
    }

    const contentType = response.headers["content-type"];
    const isPdf = contentType && contentType.includes("application/pdf");

    if (isPdf) {
      logParams.success = true;
      const pdf = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
      logParams.response_code = pdf.pageStatusCode;
      logParams.error_message = pdf.pageError;
      // The status reported to the caller is the one of this GET request;
      // the PDF processor's own status only goes to the log.
      return {
        content: pdf.content,
        pageStatusCode: status,
        pageError: pdf.pageError,
      };
    }

    const body = response.data;
    logParams.success = true;
    logParams.html = body;
    logParams.response_code = status;
    return {
      content: body,
      pageStatusCode: status,
      pageError: null,
    };
  } catch (error) {
    // ECONNABORTED is handled here as a timeout.
    const timedOut = error.code === "ECONNABORTED";
    if (timedOut) {
      logParams.error_message = "Request timed out";
      Logger.debug(`⛏️ Axios: Request timed out for ${url}`);
    } else {
      logParams.error_message = error.message || error;
      Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`);
    }
    return {
      content: "",
      pageStatusCode: error.response?.status ?? null,
      pageError: logParams.error_message,
    };
  } finally {
    logParams.time_taken_seconds = (Date.now() - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}

View File

@ -1,230 +0,0 @@
import axios from "axios";
import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities";
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
import * as Sentry from "@sentry/node";
import axiosRetry from 'axios-retry';
// Register axios-retry on the default axios export: up to 3 retries, spaced
// by the library's exponential delay, with a console notice on every retry.
axiosRetry(axios, {
  retries: 3,
  retryDelay: axiosRetry.exponentialDelay,
  onRetry: () => {
    console.log("Retrying (fire-engine)...");
  },
});
/**
 * Scrapes a URL with Fire-Engine.
 *
 * The request is submitted to the Fire-Engine service (FIRE_ENGINE_BETA_URL)
 * with `instantReturn`, and the resulting job is polled every 250 ms until it
 * stops processing or the time budget (universalTimeout + the sum of "wait"
 * actions + 5 s) is used up, in which case the job is deleted. PDF responses
 * are delegated to fetchAndProcessPdf. Failures never throw: they are
 * reported through `pageError` with empty html. Every attempt is recorded
 * with logScrape.
 *
 * Side effect kept for callers: when atsv is not granted (not requested, or
 * the team is not listed in BETA_CUSTOMERS), `pageOptions.atsv` is reset to
 * false on the object that was passed in.
 *
 * @param url The URL to scrape
 * @param actions Browser actions forwarded to Fire-Engine
 * @param waitFor The time to wait for the page to load
 * @param screenshot Whether to take a screenshot
 * @param fullPageScreenshot Whether to take a full page screenshot
 * @param pageOptions The options for the page
 * @param fireEngineOptions Engine options; never modified by this function
 * @param headers The headers to send with the request
 * @param options The options for the request (`endpoint: "request"` switches to the /request endpoint)
 * @param priority Priority forwarded to Fire-Engine
 * @param teamId Team used for the beta-customer (atsv) check
 * @returns The scraped content
 */
export async function scrapWithFireEngine({
  url,
  actions,
  waitFor = 0,
  screenshot = false,
  fullPageScreenshot = false,
  pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true, mobile: false },
  fireEngineOptions = {},
  headers,
  options,
  priority,
  teamId,
}: {
  url: string;
  actions?: Action[];
  waitFor?: number;
  screenshot?: boolean;
  fullPageScreenshot?: boolean;
  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean, mobile?: boolean };
  fireEngineOptions?: FireEngineOptions;
  headers?: Record<string, string>;
  options?: any;
  priority?: number;
  teamId?: string;
}): Promise<FireEngineResponse> {
  const logParams = {
    url,
    scraper: "fire-engine",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const reqParams = await generateRequestParams(url);
    // Per-site parameters take precedence over the caller's arguments.
    const waitParam = reqParams["params"]?.wait ?? waitFor;
    const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
    const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
    const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;

    // Work on a shallow copy: the source object is either the caller's
    // `fireEngineOptions` or the per-site entry handed out by
    // generateRequestParams, and the engine/atsv overrides below must not
    // leak into later requests through it.
    const fireEngineOptionsParam: FireEngineOptions = {
      ...(reqParams["params"]?.fireEngineOptions ?? fireEngineOptions),
    };

    let endpoint = "/scrape";
    if (options?.endpoint === "request") {
      endpoint = "/request";
    }

    let engine = engineParam; // do we want fireEngineOptions as first choice?
    if (pageOptions?.useFastMode) {
      fireEngineOptionsParam.engine = "tlsclient";
      engine = "tlsclient";
    }

    Logger.info(
      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? []).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
    );

    // atsv is only available for beta customers
    const betaCustomersString = process.env.BETA_CUSTOMERS;
    const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : [];
    if (pageOptions?.atsv && betaCustomers.includes(teamId)) {
      fireEngineOptionsParam.atsv = true;
    } else if (pageOptions) {
      // Guarded so an explicit null pageOptions does not abort the scrape.
      pageOptions.atsv = false;
    }

    const axiosInstance = axios.create({
      headers: { "Content-Type": "application/json" }
    });

    const startTime = Date.now();
    const _response = await Sentry.startSpan({
      name: "Call to fire-engine"
    }, async span => {
      return await axiosInstance.post(
        process.env.FIRE_ENGINE_BETA_URL + endpoint,
        {
          url: url,
          headers: headers,
          wait: waitParam,
          screenshot: screenshotParam,
          fullPageScreenshot: fullPageScreenshotParam,
          disableJsDom: pageOptions?.disableJsDom ?? false,
          priority,
          engine,
          instantReturn: true,
          mobile: pageOptions?.mobile ?? false,
          // Spread after the fields above so engine options can override
          // them; the fields below always win over the engine options.
          ...fireEngineOptionsParam,
          atsv: pageOptions?.atsv ?? false,
          scrollXPaths: pageOptions?.scrollXPaths ?? [],
          geolocation: pageOptions?.geolocation,
          skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
          removeBase64Images: pageOptions?.removeBase64Images ?? true,
          actions: actions,
        },
        {
          headers: {
            "Content-Type": "application/json",
            ...(Sentry.isInitialized() ? ({
              "sentry-trace": Sentry.spanToTraceHeader(span),
              "baggage": Sentry.spanToBaggageHeader(span),
            }) : {}),
          }
        }
      );
    });

    // Explicit "wait" actions extend the polling budget by their total duration.
    const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0);

    const jobId = _response.data.jobId;
    const statusUrl = `${process.env.FIRE_ENGINE_BETA_URL}/scrape/${jobId}`;

    let checkStatusResponse = await axiosInstance.get(statusUrl);

    // added 5 seconds to the timeout to account for 'smart wait'
    while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) {
      await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
      checkStatusResponse = await axiosInstance.get(statusUrl);
    }

    if (checkStatusResponse.data.processing) {
      // Still running after the budget: cancel the job (best effort, not
      // awaited) and report a timeout.
      Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${jobId}`);
      axiosInstance.delete(
        statusUrl, {
          validateStatus: (status) => true
        }
      ).catch((error) => {
        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${jobId} | error: ${error}`);
      });

      Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`);
      logParams.error_message = "Request timed out";
      return { html: "", pageStatusCode: null, pageError: "" };
    }

    if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) {
      Logger.debug(
        `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}`
      );

      logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error;
      logParams.response_code = checkStatusResponse.data?.pageStatusCode;

      if (checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
        Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
      }

      // Without a page status, a DNS resolution failure is reported as 404.
      const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;

      return {
        html: "",
        pageStatusCode,
        pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error,
      };
    }

    const contentType = checkStatusResponse.data.responseHeaders?.["content-type"];

    if (contentType && contentType.includes("application/pdf")) {
      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
        url,
        pageOptions?.parsePDF
      );
      logParams.success = true;
      logParams.response_code = pageStatusCode;
      logParams.error_message = pageError;
      return { html: content, pageStatusCode, pageError };
    } else {
      const data = checkStatusResponse.data;

      // A 404 page counts as a successful scrape for logging purposes.
      logParams.success =
        (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
        data.pageStatusCode === 404;
      logParams.html = data.content ?? "";
      logParams.response_code = data.pageStatusCode;
      logParams.error_message = data.pageError ?? data.error;

      return {
        html: data.content ?? "",
        // Fall back to the single `screenshot` field only when it is
        // present; an array literal is never nullish, so chaining `??`
        // after `[data.screenshot]` would yield `[undefined]`.
        screenshots: data.screenshots ?? (data.screenshot ? [data.screenshot] : []),
        pageStatusCode: data.pageStatusCode,
        pageError: data.pageError ?? data.error,
        scrapeActionContent: data?.actionContent ?? [],
      };
    }
  } catch (error) {
    if (error.code === "ECONNABORTED") {
      Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`);
      logParams.error_message = "Request timed out";
    } else {
      Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`);
      logParams.error_message = error.message || error;
    }
    return { html: "", pageStatusCode: null, pageError: logParams.error_message };
  } finally {
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    await logScrape(logParams, pageOptions);
  }
}

View File

@ -1,111 +0,0 @@
import axios from "axios";
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";
/**
 * Scrapes a URL through the Playwright microservice
 * (PLAYWRIGHT_MICROSERVICE_URL).
 *
 * The service's JSON reply is parsed manually; PDF responses (by content
 * type) are delegated to fetchAndProcessPdf. Failures never throw: they are
 * reported through `pageError` with empty content. Every attempt is recorded
 * with logScrape, including its duration.
 *
 * @param url The URL to scrape
 * @param waitFor The time to wait for the page to load
 * @param headers The headers to send with the request
 * @param pageOptions The options for the page
 * @returns The scraped content
 */
export async function scrapWithPlaywright(
  url: string,
  waitFor: number = 0,
  headers?: Record<string, string>,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: "playwright",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const reqParams = await generateRequestParams(url);
    // A wait value from the generated request parameters wins over `waitFor`.
    const waitParam = reqParams["params"]?.wait ?? waitFor;
    // The wait time is added on top of the base timeout, both for the
    // service and for the HTTP call itself.
    const requestTimeout = universalTimeout + waitParam;

    const response = await axios.post(
      process.env.PLAYWRIGHT_MICROSERVICE_URL,
      {
        url,
        wait_after_load: waitParam,
        timeout: requestTimeout,
        headers,
      },
      {
        headers: {
          "Content-Type": "application/json",
        },
        timeout: requestTimeout,
        // Keep the body as raw text; it is parsed explicitly below.
        transformResponse: [(data) => data],
      }
    );

    if (response.status !== 200) {
      Logger.debug(
        `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}`
      );
      logParams.error_message = response.data?.pageError;
      logParams.response_code = response.data?.pageStatusCode;
      return {
        content: "",
        pageStatusCode: response.data?.pageStatusCode,
        pageError: response.data?.pageError,
      };
    }

    const contentType = response.headers["content-type"];
    const isPdf = contentType && contentType.includes("application/pdf");

    if (isPdf) {
      logParams.success = true;
      const pdf = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
      logParams.response_code = pdf.pageStatusCode;
      logParams.error_message = pdf.pageError;
      return {
        content: pdf.content,
        pageStatusCode: pdf.pageStatusCode,
        pageError: pdf.pageError,
      };
    }

    const rawBody = response.data;
    try {
      const parsed = JSON.parse(rawBody);
      const html = parsed.content;
      logParams.success = true;
      logParams.html = html;
      logParams.response_code = parsed.pageStatusCode;
      logParams.error_message = parsed.pageError;
      return {
        content: html ?? "",
        pageStatusCode: parsed.pageStatusCode,
        pageError: parsed.pageError,
      };
    } catch (jsonError) {
      logParams.error_message = jsonError.message || jsonError;
      Logger.debug(
        `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}`
      );
      return { content: "", pageStatusCode: null, pageError: logParams.error_message };
    }
  } catch (error) {
    // ECONNABORTED is handled here as a timeout.
    const timedOut = error.code === "ECONNABORTED";
    if (timedOut) {
      logParams.error_message = "Request timed out";
      Logger.debug(`⛏️ Playwright: Request timed out for ${url}`);
    } else {
      logParams.error_message = error.message || error;
      Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`);
    }
    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
  } finally {
    logParams.time_taken_seconds = (Date.now() - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}

View File

@ -1,92 +0,0 @@
import { logScrape } from "../../../services/logging/scrape_log";
import { generateRequestParams } from "../single_url";
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
import { universalTimeout } from "../global";
import { ScrapingBeeClient } from "scrapingbee";
import { Logger } from "../../../lib/logger";
/**
 * Scrapes a URL with ScrapingBee.
 *
 * The request is sent with `transparent_status_code` enabled. PDF responses
 * (by content type) are delegated to fetchAndProcessPdf; any other body is
 * decoded as text. Failures never throw: they are reported through
 * `pageError` with empty content. Every attempt is recorded with logScrape
 * under the scraper name "scrapingBeeLoad" when `wait_browser` is
 * "networkidle2", and "scrapingBee" otherwise.
 *
 * @param url The URL to scrape
 * @param wait_browser The browser event to wait for
 * @param timeout The timeout for the scrape
 * @param pageOptions The options for the page
 * @returns The scraped content
 */
export async function scrapWithScrapingBee(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = universalTimeout,
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
  const logParams = {
    url,
    scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
    const clientParams = await generateRequestParams(
      url,
      wait_browser,
      timeout
    );

    // Logged before the request is sent, so the line also appears for
    // requests that end up failing.
    Logger.info(
      `⛏️ ScrapingBee: Scraping ${url}`
    );

    const response = await client.get({
      ...clientParams,
      params: {
        ...clientParams.params,
        transparent_status_code: "True",
      },
    });

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
      logParams.success = true;
      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
      logParams.response_code = pageStatusCode;
      logParams.error_message = pageError;
      return { content, pageStatusCode, pageError };
    } else {
      let text = "";
      // Remembered separately so the status-based bookkeeping below does
      // not erase it from the log entry.
      let decodeFailure;
      try {
        const decoder = new TextDecoder();
        text = decoder.decode(response.data);
      } catch (decodeError) {
        Logger.debug(
          `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}`
        );
        decodeFailure = decodeError.message || decodeError;
      }

      const statusError = response.statusText !== "OK" ? response.statusText : undefined;

      logParams.response_code = response.status;
      logParams.html = text;
      // A 404 page counts as a successful scrape for logging purposes.
      logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
      // The HTTP status text takes precedence; a decode failure is logged
      // when the status itself reported no problem.
      logParams.error_message = statusError ?? decodeFailure;

      return {
        content: text,
        pageStatusCode: response.status,
        pageError: statusError,
      };
    }
  } catch (error) {
    Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`);
    logParams.error_message = error.message || error;
    logParams.response_code = error.response?.status;
    return {
      content: "",
      pageStatusCode: error.response?.status,
      pageError: error.response?.statusText,
    };
  } finally {
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    await logScrape(logParams);
  }
}

View File

@ -1,506 +0,0 @@
import * as cheerio from "cheerio";
import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import {
Document,
PageOptions,
FireEngineResponse,
ExtractorOptions,
Action,
} from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
import { scrapWithFetch } from "./scrapers/fetch";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { scrapWithPlaywright } from "./scrapers/playwright";
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
import { extractLinks } from "./utils/utils";
import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
import { clientSideError } from "../../strings";
import { ScrapeActionContent } from "../../lib/entities";
import { removeBase64Images } from "./utils/removeBase64Images";
dotenv.config();
// A backend counts as enabled only when its environment variable is set to a non-empty value.
const useScrapingBee = !(process.env.SCRAPING_BEE_API_KEY === undefined || process.env.SCRAPING_BEE_API_KEY === '');
const useFireEngine = !(process.env.FIRE_ENGINE_BETA_URL === undefined || process.env.FIRE_ENGINE_BETA_URL === '');

/**
 * Scraper identifiers enabled by the environment, in the order written below.
 * "playwright" is listed only when fire-engine is not enabled, and "fetch" is
 * always present as the final entry.
 */
export const baseScrapers = [
  useFireEngine ? "fire-engine;chrome-cdp" : undefined,
  useFireEngine ? "fire-engine" : undefined,
  useScrapingBee ? "scrapingBee" : undefined,
  useFireEngine ? undefined : "playwright",
  useScrapingBee ? "scrapingBeeLoad" : undefined,
  "fetch",
].filter((name) => Boolean(name));
/**
 * Build the request parameters used to scrape `url`.
 *
 * The defaults are the URL itself, the given `timeout` / `wait_browser` params
 * and a "ScrapingService-Request" header. When `urlSpecificParams` has an own
 * entry keyed by the URL's hostname (with a leading "www." removed), that
 * entry is shallow-merged over the defaults, so an entry's `params` or
 * `headers` replace the default ones wholesale.
 *
 * @param url URL that is going to be scraped.
 * @param wait_browser Value placed in `params.wait_browser`.
 * @param timeout Value placed in `params.timeout`.
 * @returns The merged parameters, or the defaults when there is no
 *   site-specific entry or the hostname cannot be derived from `url`.
 */
export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const defaultParams = {
    url,
    params: { timeout, wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  try {
    const urlKey = new URL(url).hostname.replace(/^www\./, "");
    // Borrow hasOwnProperty from Object.prototype so the check does not
    // depend on the lookup table inheriting (or shadowing) that method.
    if (!Object.prototype.hasOwnProperty.call(urlSpecificParams, urlKey)) {
      return defaultParams;
    }
    return { ...defaultParams, ...urlSpecificParams[urlKey] };
  } catch (error) {
    // `new URL` throws on malformed input; fall back to the defaults.
    Logger.error(`Error generating URL key: ${error}`);
    return defaultParams;
  }
}
/**
 * Get the order of scrapers to be used for scraping a URL.
 * Scrapers whose environment variable is not set are removed from the order;
 * this now also applies to `defaultScraper`, which used to be prepended even
 * when its backend was not configured.
 *
 * @param defaultScraper Scraper to try first for this URL, if any. Ignored when
 *   empty or when the environment variable it needs is missing.
 * @param isWaitPresent Currently unused; kept so existing callers keep compiling.
 * @param isScreenshotPresent Currently unused; kept so existing callers keep compiling.
 * @param isHeadersPresent Currently unused; kept so existing callers keep compiling.
 * @param isActionsPresent When true, only "fire-engine;chrome-cdp" is returned
 *   (or an empty list when fire-engine is not enabled).
 * @returns The order of scrapers to be used for scraping a URL, without duplicates.
 */
function getScrapingFallbackOrder(
  defaultScraper?: string,
  isWaitPresent: boolean = false,
  isScreenshotPresent: boolean = false,
  isHeadersPresent: boolean = false,
  isActionsPresent: boolean = false,
) {
  if (isActionsPresent) {
    return useFireEngine ? ["fire-engine;chrome-cdp"] : [];
  }

  // Whether the environment variable a scraper depends on is set. Scrapers
  // without such a requirement (e.g. "fetch") are always considered configured.
  const isConfigured = (scraper: string | undefined): boolean => {
    switch (scraper) {
      case "scrapingBee":
      case "scrapingBeeLoad":
        return !!process.env.SCRAPING_BEE_API_KEY;
      case "fire-engine":
      case "fire-engine;chrome-cdp":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "playwright":
        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
      default:
        return true;
    }
  };

  const availableScrapers = baseScrapers.filter((scraper) => isConfigured(scraper));

  const defaultOrder = [
    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
    useFireEngine ? "fire-engine" : undefined,
    useScrapingBee ? "scrapingBee" : undefined,
    useScrapingBee ? "scrapingBeeLoad" : undefined,
    useFireEngine ? undefined : "playwright",
    "fetch",
  ].filter(Boolean);

  const filteredDefaultOrder = defaultOrder.filter(
    (scraper: (typeof baseScrapers)[number]) =>
      availableScrapers.includes(scraper)
  );

  // Only honour the preferred scraper when its backend is configured.
  const preferred = defaultScraper && isConfigured(defaultScraper) ? [defaultScraper] : [];

  // A Set keeps first-insertion order, so this de-duplicates while preserving priority.
  const uniqueScrapers = new Set([
    ...preferred,
    ...filteredDefaultOrder,
    ...availableScrapers,
  ]);
  return Array.from(uniqueScrapers) as (typeof baseScrapers)[number][];
}
/**
 * Scrape a single URL and convert the result into a `Document`.
 *
 * Scrapers are tried in the order returned by `getScrapingFallbackOrder`. The
 * loop stops at the first attempt that yields at least 100 characters of
 * markdown or a screenshot, or that reports a 400/404 status. When the caller
 * supplies `existingHtml` that is at least 100 characters long and does not
 * contain the `clientSideError` marker, that HTML is converted directly
 * instead of being fetched again.
 *
 * Errors raised while scraping are not rethrown: they are logged, recorded as
 * a ScrapeEvents "error" entry, and an empty `Document` carrying the last
 * known status code and page error is returned.
 *
 * @param jobId Job identifier under which scrape events are recorded.
 * @param urlToScrap URL to scrape; surrounding whitespace is trimmed.
 * @param pageOptions Page options; missing fields are filled with defaults.
 * @param extractorOptions Optional extractor options; only `mode` is kept.
 * @param existingHtml HTML already obtained for this URL (e.g. by the crawler).
 * @param priority Forwarded to fire-engine requests.
 * @param teamId Forwarded to fire-engine requests.
 * @returns The scraped document, or an empty document when scraping failed.
 */
export async function scrapSingleUrl(
  jobId: string,
  urlToScrap: string,
  pageOptions: PageOptions,
  extractorOptions?: ExtractorOptions,
  existingHtml?: string,
  priority?: number,
  teamId?: string
): Promise<Document> {
  // Fill in defaults so the rest of the function can read every option directly.
  pageOptions = {
    includeMarkdown: pageOptions.includeMarkdown ?? true,
    includeExtract: pageOptions.includeExtract ?? false,
    onlyMainContent: pageOptions.onlyMainContent ?? false,
    includeHtml: pageOptions.includeHtml ?? false,
    includeRawHtml: pageOptions.includeRawHtml ?? false,
    waitFor: pageOptions.waitFor ?? undefined,
    screenshot: pageOptions.screenshot ?? false,
    fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
    headers: pageOptions.headers ?? undefined,
    includeLinks: pageOptions.includeLinks ?? true,
    replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
    parsePDF: pageOptions.parsePDF ?? true,
    removeTags: pageOptions.removeTags ?? [],
    onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
    useFastMode: pageOptions.useFastMode ?? false,
    disableJsDom: pageOptions.disableJsDom ?? false,
    atsv: pageOptions.atsv ?? false,
    actions: pageOptions.actions ?? undefined,
    geolocation: pageOptions.geolocation ?? undefined,
    skipTlsVerification: pageOptions.skipTlsVerification ?? false,
    removeBase64Images: pageOptions.removeBase64Images ?? true,
    mobile: pageOptions.mobile ?? false,
    // Carried over because the ScrapingBee timeout below is chosen from it;
    // it was previously dropped here, which made that choice always 15000.
    fallback: pageOptions.fallback,
  };

  if (extractorOptions) {
    extractorOptions = {
      mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
    };
  }

  // HTML handed over by the caller; an absent value is treated as empty.
  const crawlerHtml = existingHtml || "";
  urlToScrap = urlToScrap.trim();

  // Runs one scraper against `url`, then post-processes whatever HTML it got
  // (custom scraping rules, element clean-up, markdown conversion) and logs
  // the outcome as a scrape event.
  const attemptScraping = async (
    url: string,
    method: (typeof baseScrapers)[number]
  ) => {
    const scraperResponse: {
      text: string;
      screenshot: string;
      actions?: {
        screenshots?: string[];
        scrapes?: ScrapeActionContent[];
      };
      metadata: { pageStatusCode?: number; pageError?: string | null };
    } = { text: "", screenshot: "", metadata: {} };

    const timer = Date.now();
    const logInsertPromise = ScrapeEvents.insert(jobId, {
      type: "scrape",
      url,
      worker: process.env.FLY_MACHINE_ID,
      method,
      result: null,
    });

    // Each backend is skipped (leaving the response empty) when the
    // environment variable it needs is not set.
    switch (method) {
      case "fire-engine":
      case "fire-engine;chrome-cdp": {
        const engine: "playwright" | "chrome-cdp" | "tlsclient" =
          method === "fire-engine;chrome-cdp" ? "chrome-cdp" : "playwright";
        if (process.env.FIRE_ENGINE_BETA_URL) {
          // Fire-engine handles wait times between actions itself, so the
          // user's actions are forwarded unchanged.
          const processedActions: Action[] = pageOptions.actions ?? [];
          const response = await scrapWithFireEngine({
            url,
            ...(engine === "chrome-cdp" ? ({
              // chrome-cdp expresses waiting and screenshots as leading actions.
              actions: [
                ...(pageOptions.waitFor ? [{
                  type: "wait" as const,
                  milliseconds: pageOptions.waitFor,
                }] : []),
                ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{
                  type: "screenshot" as const,
                  fullPage: !!pageOptions.fullPageScreenshot,
                }] : []),
                ...processedActions,
              ],
            }) : ({
              waitFor: pageOptions.waitFor,
              screenshot: pageOptions.screenshot,
              fullPageScreenshot: pageOptions.fullPageScreenshot,
            })),
            pageOptions: pageOptions,
            headers: pageOptions.headers,
            fireEngineOptions: {
              engine: engine,
              atsv: pageOptions.atsv,
              disableJsDom: pageOptions.disableJsDom,
            },
            priority,
            teamId,
          });
          scraperResponse.text = response.html;
          if (pageOptions.screenshot || pageOptions.fullPageScreenshot) {
            // splice removes the first screenshot from response.screenshots,
            // so it is not repeated in the action screenshots below.
            scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? "";
          }
          if (pageOptions.actions) {
            scraperResponse.actions = {
              screenshots: response.screenshots ?? [],
              scrapes: response.scrapeActionContent ?? [],
            };
          }
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      }
      case "scrapingBee": {
        if (process.env.SCRAPING_BEE_API_KEY) {
          const response = await scrapWithScrapingBee(
            url,
            "domcontentloaded",
            pageOptions.fallback === false ? 7000 : 15000
          );
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      }
      case "playwright": {
        if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
          const response = await scrapWithPlaywright(
            url,
            pageOptions.waitFor,
            pageOptions.headers
          );
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      }
      case "scrapingBeeLoad": {
        if (process.env.SCRAPING_BEE_API_KEY) {
          const response = await scrapWithScrapingBee(url, "networkidle2");
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      }
      case "fetch": {
        const response = await scrapWithFetch(url);
        scraperResponse.text = response.content;
        scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
        scraperResponse.metadata.pageError = response.pageError;
        break;
      }
    }

    let customScrapedContent: FireEngineResponse | null = null;

    // Check for custom scraping conditions
    const customScraperResult = await handleCustomScraping(
      scraperResponse.text,
      url
    );

    if (customScraperResult) {
      switch (customScraperResult.scraper) {
        case "fire-engine": {
          customScrapedContent = await scrapWithFireEngine({
            url: customScraperResult.url,
            actions: customScraperResult.waitAfterLoad ? ([
              {
                type: "wait",
                milliseconds: customScraperResult.waitAfterLoad,
              }
            ]) : ([]),
            pageOptions: customScraperResult.pageOptions,
          });
          break;
        }
        case "pdf": {
          const { content, pageStatusCode, pageError } =
            await fetchAndProcessPdf(
              customScraperResult.url,
              pageOptions?.parsePDF
            );
          customScrapedContent = {
            html: content,
            pageStatusCode,
            pageError,
          };
          break;
        }
      }
    }

    if (customScrapedContent) {
      scraperResponse.text = customScrapedContent.html;
    }

    //* TODO: add an optional to return markdown or structured/extracted content
    const cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
    let text = await parseMarkdown(cleanedHtml);
    if (pageOptions.removeBase64Images) {
      text = await removeBase64Images(text);
    }

    const insertedLogId = await logInsertPromise;
    ScrapeEvents.updateScrapeResult(insertedLogId, {
      response_size: scraperResponse.text.length,
      success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
      error: scraperResponse.metadata.pageError,
      response_code: scraperResponse.metadata.pageStatusCode,
      time_taken: Date.now() - timer,
    });

    return {
      text,
      html: cleanedHtml,
      rawHtml: scraperResponse.text,
      screenshot: scraperResponse.screenshot,
      actions: scraperResponse.actions,
      pageStatusCode: scraperResponse.metadata.pageStatusCode,
      pageError: scraperResponse.metadata.pageError || undefined,
    };
  };

  let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = {
    text: "",
    html: "",
    rawHtml: "",
    screenshot: "",
    actions: undefined,
    pageStatusCode: 200,
    pageError: undefined,
  };

  try {
    // Site-specific settings are keyed by hostname without a leading "www.".
    let urlKey = urlToScrap;
    try {
      urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
    } catch (error) {
      Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
    }

    const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
    const scrapersInOrder = getScrapingFallbackOrder(
      defaultScraper,
      !!pageOptions.waitFor && pageOptions.waitFor > 0,
      pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true,
      !!pageOptions.headers,
      Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0,
    );
    const lastScraper = scrapersInOrder[scrapersInOrder.length - 1];

    // Loop-invariant: whether the caller-provided HTML can be used as-is.
    const canUseCrawlerHtml =
      crawlerHtml.trim().length >= 100 && !crawlerHtml.includes(clientSideError);

    for (const scraper of scrapersInOrder) {
      // If exists text coming from crawler, use it
      if (canUseCrawlerHtml) {
        const cleanedHtml = removeUnwantedElements(crawlerHtml, pageOptions);
        text = await parseMarkdown(cleanedHtml);
        html = cleanedHtml;
        // Keep the source HTML so metadata and links below are extracted from
        // it rather than from an empty document.
        rawHtml = crawlerHtml;
        break;
      }

      const attempt = await attemptScraping(urlToScrap, scraper);
      text = attempt.text ?? "";
      html = attempt.html ?? "";
      rawHtml = attempt.rawHtml ?? "";
      screenshot = attempt.screenshot ?? "";
      actions = attempt.actions ?? undefined;

      const attemptStatus = attempt.pageStatusCode;
      if (attemptStatus) {
        pageStatusCode = attemptStatus;
      }

      const attemptFailedByStatus = attemptStatus !== undefined && attemptStatus >= 400;
      if (attempt.pageError && (attemptFailedByStatus || scraper === lastScraper)) {
        // force pageError if it's the last scraper and it failed too
        pageError = attempt.pageError;
        if (!attemptStatus || attemptStatus < 400) {
          pageStatusCode = 500;
        }
      } else if (attemptStatus && attemptStatus < 400) {
        pageError = undefined;
      }

      if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
        break;
      }
      // A 400/404 is a definitive answer from the site; do not try other scrapers.
      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
        break;
      }
    }

    if (!text) {
      throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
    }

    const soup = cheerio.load(rawHtml);
    const metadata = extractMetadata(soup, urlToScrap);

    let linksOnPage: string[] | undefined;
    if (pageOptions.includeLinks) {
      linksOnPage = extractLinks(rawHtml, urlToScrap);
    }

    const document: Document = {
      content: text,
      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
      html: pageOptions.includeHtml ? html : undefined,
      rawHtml:
        pageOptions.includeRawHtml ||
        (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
          ? rawHtml
          : undefined,
      linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
      actions,
      metadata: {
        ...metadata,
        ...(screenshot && screenshot.length > 0 ? ({
          screenshot,
        }) : {}),
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
        pageError: pageError,
      },
    };

    return document;
  } catch (error) {
    // The thrown value may be a string, an Error, or anything else; read
    // `message` / `stack` only when they actually exist.
    const details =
      typeof error === "object" && error !== null
        ? (error as { message?: unknown; stack?: unknown })
        : undefined;
    const message =
      typeof error === "string"
        ? error
        : typeof details?.message === "string"
          ? details.message
          : JSON.stringify(error);
    const stack = typeof details?.stack === "string" ? details.stack : undefined;

    Logger.debug(`⛏️ Error: ${message} - Failed to fetch URL: ${urlToScrap}`);
    ScrapeEvents.insert(jobId, {
      type: "error",
      message,
      stack,
    });

    return {
      content: "",
      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
      html: "",
      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
        pageError: pageError,
      },
    } as Document;
  }
}

View File

@ -1,9 +1,10 @@
import axios from "axios"; import axios from "axios";
import { axiosTimeout } from "../../lib/timeout"; import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js"; import { parseStringPromise } from "xml2js";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { WebCrawler } from "./crawler"; import { WebCrawler } from "./crawler";
import { Logger } from "../../lib/logger"; import { logger } from "../../lib/logger";
import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types";
export async function getLinksFromSitemap( export async function getLinksFromSitemap(
{ {
@ -17,17 +18,20 @@ export async function getLinksFromSitemap(
} }
): Promise<string[]> { ): Promise<string[]> {
try { try {
let content: string; let content: string = "";
try { try {
if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') { if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data; content = response.data;
} else if (mode === 'fire-engine') { } else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"playwright" } }); const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });;
content = response.html; if (!response.success) {
throw response.error;
}
content = response.document.rawHtml!;
} }
} catch (error) { } catch (error) {
Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`); logger.error(`Request failed for ${sitemapUrl}: ${error.message}`);
return allUrls; return allUrls;
} }
@ -47,7 +51,7 @@ export async function getLinksFromSitemap(
allUrls.push(...validUrls); allUrls.push(...validUrls);
} }
} catch (error) { } catch (error) {
Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`); logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`);
} }
return allUrls; return allUrls;

View File

@ -1,15 +0,0 @@
import * as docxProcessor from "../docxProcessor";
// Integration spec: downloads a public DOCX file over the network and checks
// the content returned by fetchAndProcessDocx.
describe("DOCX Processing Module - Integration Test", () => {
  // Remember the key so deleting it below does not leak into other specs
  // running in the same process.
  const originalLlamaparseKey = process.env.LLAMAPARSE_API_KEY;

  afterAll(() => {
    if (originalLlamaparseKey === undefined) {
      delete process.env.LLAMAPARSE_API_KEY;
    } else {
      process.env.LLAMAPARSE_API_KEY = originalLlamaparseKey;
    }
  });

  it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => {
    delete process.env.LLAMAPARSE_API_KEY;
    const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx(
      "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx"
    );
    expect(content.trim()).toContain(
      "SERIES A PREFERRED STOCK PURCHASE AGREEMENT"
    );
    expect(pageStatusCode).toBe(200);
    expect(pageError).toBeUndefined();
  }, 60000); // 60 seconds: the download can exceed Jest's default timeout
});

View File

@ -1,128 +0,0 @@
import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
import cheerio from 'cheerio';
// Table-driven spec: each case feeds `html` to parseTablesToMarkdown and
// expects exactly `expectedMarkdown` back.
describe('parseTablesToMarkdown', () => {
  const cases: { name: string; html: string; expectedMarkdown: string }[] = [
    {
      name: 'converts a simple HTML table to Markdown',
      html: `
<table>
<tr><th>Header 1</th><th>Header 2</th></tr>
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
</table>
`,
      expectedMarkdown: `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`,
    },
    {
      name: 'converts a table with a single row to Markdown',
      html: `
<table>
<tr><th>Header 1</th><th>Header 2</th></tr>
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
</table>
`,
      expectedMarkdown: `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>`,
    },
    {
      name: 'converts a table with a single column to Markdown',
      html: `
<table>
<tr><th>Header 1</th></tr>
<tr><td>Row 1 Col 1</td></tr>
<tr><td>Row 2 Col 1</td></tr>
</table>
`,
      expectedMarkdown: `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>`,
    },
    {
      name: 'converts a table with a single cell to Markdown',
      html: `
<table>
<tr><th>Header 1</th></tr>
<tr><td>Row 1 Col 1</td></tr>
</table>
`,
      expectedMarkdown: `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>`,
    },
    {
      name: 'converts a table with no header to Markdown',
      html: `
<table>
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
</table>
`,
      expectedMarkdown: `<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`,
    },
    {
      name: 'converts a table with no rows to Markdown',
      html: `
<table>
</table>
`,
      expectedMarkdown: `<div></div>`,
    },
    {
      name: 'converts a table with no cells to Markdown',
      html: `
<table>
<tr></tr>
</table>
`,
      expectedMarkdown: `<div></div>`,
    },
    {
      name: 'converts a table with no columns to Markdown',
      html: `
<table>
<tr><th></th></tr>
</table>
`,
      expectedMarkdown: `<div></div>`,
    },
    {
      name: 'converts a table with no table to Markdown',
      html: ``,
      expectedMarkdown: ``,
    },
    {
      name: 'converts a table inside of a bunch of html noise',
      html: `
<div>
<p>Some text before</p>
<table>
<tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
<tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
</table>
<p>Some text after</p>
</div>
`,
      expectedMarkdown: `<div>
<p>Some text before</p>
<div>| Row 1 Col 1 | Row 1 Col 2 |
| Row 2 Col 1 | Row 2 Col 2 |</div>
<p>Some text after</p>
</div>`,
    },
  ];

  // Register one test per case, in the order listed above.
  for (const { name, html, expectedMarkdown } of cases) {
    it(name, async () => {
      const markdown = await parseTablesToMarkdown(html);
      expect(markdown).toBe(expectedMarkdown);
    });
  }
});

View File

@ -1,19 +0,0 @@
import * as pdfProcessor from '../pdfProcessor';
// Integration spec: both tests download a PDF over the network and check the
// result of fetchAndProcessPdf with parsing enabled and disabled.
describe('PDF Processing Module - Integration Test', () => {
  // Remember the key so deleting it below does not leak into other specs
  // running in the same process.
  const originalLlamaparseKey = process.env.LLAMAPARSE_API_KEY;

  afterAll(() => {
    if (originalLlamaparseKey === undefined) {
      delete process.env.LLAMAPARSE_API_KEY;
    } else {
      process.env.LLAMAPARSE_API_KEY = originalLlamaparseKey;
    }
  });

  it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
    delete process.env.LLAMAPARSE_API_KEY;
    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true);
    expect(content.trim()).toEqual("Dummy PDF file");
    expect(pageStatusCode).toEqual(200);
    expect(pageError).toBeUndefined();
  }, 60000); // 60 seconds, same allowance as the test below

  it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => {
    const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false);
    expect(pageStatusCode).toBe(200);
    expect(pageError).toBeUndefined();
    expect(content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj');
  }, 60000); // 60 seconds
});

File diff suppressed because one or more lines are too long

View File

@ -1,127 +0,0 @@
import { Document } from "../../../../lib/entities";
import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
// Specs for the helpers that rewrite relative markdown link / image targets
// against each document's sourceURL.
describe('replacePaths', () => {
  // Builds the single-document fixture used by every test. `markdown` mirrors
  // `content` unless a different value is passed explicitly.
  const docsWith = (content: string, markdown: string = content): Document[] => [{
    metadata: { sourceURL: 'https://example.com' },
    content,
    markdown,
  }];

  describe('replacePathsWithAbsolutePaths', () => {
    it('should replace relative paths with absolute paths', () => {
      const documents = docsWith('This is a [link](/path/to/resource).');
      const expectedDocuments = docsWith('This is a [link](https://example.com/path/to/resource).');

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should not alter absolute URLs', () => {
      const documents = docsWith('This is an [external link](https://external.com/path).');

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(documents); // Expect no change
    });

    it('should not alter data URLs for images', () => {
      const documents = docsWith('This is an image: ![alt text]().');

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(documents); // Expect no change
    });

    it('should handle multiple links and images correctly', () => {
      const documents = docsWith('Here are two links: [link1](/path1) and [link2](/path2).');
      const expectedDocuments = docsWith('Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).');

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should correctly handle a mix of absolute and relative paths', () => {
      const documents = docsWith('Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image]().');
      const expectedDocuments = docsWith('Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image]().');

      const result = replacePathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });
  });

  describe('replaceImgPathsWithAbsolutePaths', () => {
    it('should replace relative image paths with absolute paths', () => {
      const documents = docsWith('Here is an image: ![alt text](/path/to/image.jpg).');
      const expectedDocuments = docsWith('Here is an image: ![alt text](https://example.com/path/to/image.jpg).');

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should not alter data:image URLs', () => {
      // content and markdown intentionally differ in this fixture.
      const documents = docsWith(
        'An image with a data URL: ![alt text]().',
        'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).'
      );

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(documents); // Expect no change
    });

    it('should handle multiple images with a mix of data and relative URLs', () => {
      const documents = docsWith('Multiple images: ![img1](/img1.jpg) ![img2]() ![img3](/img3.jpg).');
      const expectedDocuments = docsWith('Multiple images: ![img1](https://example.com/img1.jpg) ![img2]() ![img3](https://example.com/img3.jpg).');

      const result = replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });
  });
});

View File

@ -1,66 +0,0 @@
import { Logger } from '../../../../lib/logger';
import { isUrlBlocked } from '../blocklist';
// Specs for isUrlBlocked: listed social-media URLs are blocked, URLs carrying
// an allowed keyword are not, and unrelated URLs are never blocked.
describe('isUrlBlocked', () => {
  it('should return true for blocked social media URLs', () => {
    const blockedUrls = [
      'https://www.facebook.com',
      'https://twitter.com/someuser',
      'https://instagram.com/someuser',
      'https://www.linkedin.com/in/someuser',
      'https://snapchat.com/someuser',
      'https://tiktok.com/@someuser',
      'https://reddit.com/r/somesubreddit',
      'https://flickr.com/photos/someuser',
      'https://whatsapp.com/someuser',
      'https://wechat.com/someuser',
      'https://telegram.org/someuser',
    ];

    for (const url of blockedUrls) {
      // Log the offending URL first so a failing expectation is easy to trace.
      if (!isUrlBlocked(url)) {
        Logger.debug(`URL not blocked: ${url}`);
      }
      expect(isUrlBlocked(url)).toBe(true);
    }
  });

  it('should return false for URLs containing allowed keywords', () => {
    const allowedUrls = [
      'https://www.facebook.com/privacy',
      'https://twitter.com/terms',
      'https://instagram.com/legal',
      'https://www.linkedin.com/help',
      'https://pinterest.com/about',
      'https://snapchat.com/support',
      'https://tiktok.com/contact',
      'https://reddit.com/user-agreement',
      'https://tumblr.com/policy',
      'https://flickr.com/blog',
      'https://whatsapp.com/press',
      'https://wechat.com/careers',
      'https://telegram.org/conditions',
      'https://wix.com/careers',
    ];

    for (const url of allowedUrls) {
      expect(isUrlBlocked(url)).toBe(false);
    }
  });

  it('should return false for non-blocked URLs', () => {
    const nonBlockedUrls = [
      'https://www.example.com',
      'https://www.somewebsite.org',
      'https://subdomain.example.com',
      'firecrawl.dev',
      'amazon.com',
      'wix.com',
      'https://wix.com'
    ];

    for (const url of nonBlockedUrls) {
      expect(isUrlBlocked(url)).toBe(false);
    }
  });
});

View File

@ -1,4 +1,4 @@
import { Logger } from "../../../lib/logger"; import { logger } from "../../../lib/logger";
const socialMediaBlocklist = [ const socialMediaBlocklist = [
'facebook.com', 'facebook.com',
@ -68,7 +68,7 @@ export function isUrlBlocked(url: string): boolean {
return isBlocked; return isBlocked;
} catch (e) { } catch (e) {
// If an error occurs (e.g., invalid URL), return false // If an error occurs (e.g., invalid URL), return false
Logger.error(`Error parsing the following URL: ${url}`); logger.error(`Error parsing the following URL: ${url}`);
return false; return false;
} }
} }

View File

@ -1,198 +0,0 @@
/**
 * Builds a fresh copy of the browser-like request headers that several
 * host overrides share. A new object is returned on every call so that no two
 * entries alias the same headers object.
 */
const browserLikeHeaders = () => ({
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  "sec-fetch-site": "same-origin",
  "sec-fetch-mode": "cors",
  "sec-fetch-dest": "empty",
  referer: "https://www.google.com/",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",
  accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
});

/** Params block used by hosts that wait for `networkidle2` plus a fixed delay (ms). */
const networkIdleParams = (wait: number) => ({
  wait_browser: "networkidle2",
  block_resources: false,
  wait,
});

/** Override that routes a host to fire-engine with browser-like headers and a network-idle wait. */
const fireEngineWithBrowserHeaders = (wait: number) => ({
  defaultScraper: "fire-engine",
  params: networkIdleParams(wait),
  headers: browserLikeHeaders(),
});

/** Override that routes a host to plain fetch with browser-like headers. */
const fetchWithBrowserHeaders = () => ({
  defaultScraper: "fetch",
  headers: browserLikeHeaders(),
});

/** Override that routes a host to fire-engine and pins the given fire-engine engine. */
const fireEngineWithEngine = (engine: string) => ({
  defaultScraper: "fire-engine",
  params: {
    fireEngineOptions: {
      engine,
    },
  },
});

/**
 * Per-hostname scraper overrides: which scraper to use by default and any
 * extra params / headers to send for that host.
 */
export const urlSpecificParams = {
  "support.greenpay.me": fireEngineWithBrowserHeaders(2000),
  "docs.pdw.co": fireEngineWithBrowserHeaders(3000),
  "developers.notion.com": fireEngineWithBrowserHeaders(2000),
  "docs2.hubitat.com": fireEngineWithBrowserHeaders(2000),
  "scrapethissite.com": fetchWithBrowserHeaders(),
  "rsseau.fr": fetchWithBrowserHeaders(),
  "help.salesforce.com": fireEngineWithBrowserHeaders(2000),
  "ir.veeva.com": {
    defaultScraper: "fire-engine",
  },
  "eonhealth.com": {
    defaultScraper: "fire-engine",
    params: {
      fireEngineOptions: {
        mobileProxy: true,
        method: "get",
        engine: "request",
      },
    },
  },
  "notion.com": {
    defaultScraper: "fire-engine",
    params: {
      ...networkIdleParams(2000),
      engine: "playwright",
    },
  },
  "developer.apple.com": {
    defaultScraper: "fire-engine",
    params: {
      engine: "playwright",
      wait: 2000,
      fireEngineOptions: {
        blockMedia: false,
      },
    },
  },
  "amazon.com": fireEngineWithEngine("chrome-cdp"),
  "digikey.com": fireEngineWithEngine("tlsclient"),
  "zoopla.co.uk": fireEngineWithEngine("chrome-cdp"),
  "lorealparis.hu": fireEngineWithEngine("tlsclient"),
};

View File

@ -1,79 +0,0 @@
import axios from "axios";
import fs from "fs";
import { createWriteStream } from "node:fs";
import path from "path";
import os from "os";
import mammoth from "mammoth";
import { Logger } from "../../../lib/logger";
/**
 * Downloads the DOCX at `url` to a temporary file, extracts its raw text and
 * removes the temporary file again.
 *
 * Never rejects: any failure is reported through `pageStatusCode: 500` and
 * `pageError`, with `content` set to the empty string.
 *
 * @param url Location of the DOCX document.
 * @returns Extracted text plus the status code / error reported by the download.
 */
export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
  let tempFilePath = '';
  let pageStatusCode = 200;
  let pageError = '';
  let content = '';
  try {
    const downloadResult = await downloadDocx(url);
    tempFilePath = downloadResult.tempFilePath;
    pageStatusCode = downloadResult.pageStatusCode;
    pageError = downloadResult.pageError;
    // downloadDocx reports a failed download with an empty path; there is
    // nothing to convert in that case, so skip the (doomed) extraction.
    if (tempFilePath) {
      content = await processDocxToText(tempFilePath);
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to fetch and process DOCX: ${message}`);
    pageStatusCode = 500;
    pageError = message;
    content = '';
  } finally {
    if (tempFilePath) {
      try {
        fs.unlinkSync(tempFilePath); // Clean up the temporary file
      } catch (cleanupError) {
        // A failed cleanup must not replace the result we already computed.
        Logger.debug(`Failed to remove temporary DOCX file ${tempFilePath}: ${cleanupError}`);
      }
    }
  }
  return { content, pageStatusCode, pageError };
}
/**
 * Streams the DOCX at `url` into a uniquely named file in the OS temp dir.
 *
 * @param url Location of the DOCX document.
 * @returns The temp file path and the HTTP status of the download. When the
 *   request itself fails, resolves with an empty `tempFilePath` and status 500.
 * @throws Rejects when writing the file or reading the response stream fails;
 *   the partially written file is removed on a best-effort basis first.
 */
async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
  try {
    const response = await axios({
      url,
      method: "GET",
      responseType: "stream",
    });
    // Date.now() alone collides when two downloads start in the same millisecond.
    const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
    const tempFilePath = path.join(os.tmpdir(), `tempDocx-${uniqueSuffix}.docx`);
    const writer = createWriteStream(tempFilePath);
    return new Promise((resolve, reject) => {
      const fail = (reason: string) => {
        Logger.error(reason);
        writer.destroy();
        // Best-effort removal of the partial file; the caller never learns its path.
        fs.unlink(tempFilePath, () => {});
        reject(new Error(reason));
      };
      writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
      writer.on("error", () => fail('Failed to write DOCX file to disk'));
      // pipe() does not forward source errors; without this listener a broken
      // download would leave the promise pending forever.
      response.data.on("error", () => fail('Failed to read DOCX response stream'));
      response.data.pipe(writer);
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to download DOCX: ${message}`);
    return { tempFilePath: "", pageStatusCode: 500, pageError: message };
  }
}
/**
 * Converts the DOCX file at `filePath` to plain text.
 * Failures are logged and reported as an empty string instead of rejecting.
 */
export async function processDocxToText(filePath: string): Promise<string> {
  let text: string;
  try {
    text = await extractTextFromDocx(filePath);
  } catch (error) {
    Logger.error(`Failed to process DOCX to text: ${error.message}`);
    text = "";
  }
  return text;
}
/**
 * Runs mammoth's raw-text extraction on the file at `filePath`.
 * Any extraction error is logged and turned into an empty string.
 */
async function extractTextFromDocx(filePath: string): Promise<string> {
  let rawText = "";
  try {
    const extraction = await mammoth.extractRawText({ path: filePath });
    rawText = extraction.value;
  } catch (error) {
    Logger.error(`Failed to extract text from DOCX: ${error.message}`);
  }
  return rawText;
}

View File

@ -1,42 +0,0 @@
/**
 * CSS selectors for page chrome (navigation, footers, ads, overlays, ...)
 * as opposed to main content. Order is preserved from the original list.
 */
export const excludeNonMainTags = [
  // structural elements
  "header", "footer", "nav", "aside",
  // top / bottom bars
  ".top", ".navbar", ".footer", ".bottom", "#footer",
  // sidebars
  ".sidebar", ".side", ".aside", "#sidebar",
  // modals and overlays
  ".modal", ".popup", "#modal", ".overlay",
  // advertising
  ".ad", ".ads", ".advert", "#ad",
  // language pickers
  ".lang-selector", ".language", "#language-selector",
  // social widgets
  ".social", ".social-media", ".social-links", "#social",
  // menus and breadcrumbs
  ".menu", ".navigation", "#nav", ".breadcrumbs", "#breadcrumbs",
  // search
  "#search-form", ".search", "#search",
  // sharing
  ".share", "#share",
  // cookie banners
  ".cookie", "#cookie"
];

View File

@ -1,89 +0,0 @@
import Anthropic from '@anthropic-ai/sdk';
import axios from 'axios';
import { Logger } from '../../../lib/logger';
/** Image media types accepted for base64 image blocks in the Anthropic request below. */
const ANTHROPIC_IMAGE_MEDIA_TYPES = ["image/jpeg", "image/png", "image/gif", "image/webp"] as const;
type AnthropicImageMediaType = typeof ANTHROPIC_IMAGE_MEDIA_TYPES[number];

/**
 * Maps a `Content-Type` header value to a supported image media type,
 * defaulting to `image/png` (the previously hard-coded value) when the header
 * is missing or not one of the supported types.
 */
function resolveImageMediaType(contentType: unknown): AnthropicImageMediaType {
  const normalized = typeof contentType === "string"
    ? contentType.split(";")[0].trim().toLowerCase()
    : "";
  return ANTHROPIC_IMAGE_MEDIA_TYPES.find((candidate) => candidate === normalized) ?? "image/png";
}

/**
 * Asks a vision model for concise alt text for an image.
 *
 * @param imageUrl URL of the image to describe.
 * @param backText Text surrounding the image, used as context in the prompt.
 * @param frontText Further surrounding text, used as context in the prompt.
 * @param model `"claude-3-opus"` selects Anthropic; any other value uses
 *   OpenAI's `gpt-4-turbo` (the value itself is not forwarded to OpenAI).
 * @returns The generated alt text, or `""` when generation fails for any
 *   reason (missing API key, network error, empty model answer).
 */
export async function getImageDescription(
  imageUrl: string,
  backText: string,
  frontText: string,
  model: string = "gpt-4-turbo"
): Promise<string> {
  try {
    const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
      backText +
      " and the following text: " +
      frontText +
      ". Be super concise."
    switch (model) {
      case 'claude-3-opus': {
        if (!process.env.ANTHROPIC_API_KEY) {
          throw new Error("No Anthropic API key provided");
        }
        // Anthropic needs the image inline, so download and base64-encode it.
        const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' });
        const imageMediaType = resolveImageMediaType(imageRequest.headers?.["content-type"]);
        const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64');
        const anthropic = new Anthropic();
        const response = await anthropic.messages.create({
          model: "claude-3-opus-20240229",
          max_tokens: 1024,
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "image",
                  source: {
                    type: "base64",
                    media_type: imageMediaType,
                    data: imageData,
                  },
                },
                {
                  type: "text",
                  text: prompt
                }
              ],
            }
          ]
        });
        // The message carries a `content` array of blocks; the previous
        // `response[0].content.text` always threw and fell into the catch.
        const textBlock = response.content.find((block) => block.type === "text");
        return textBlock && "text" in textBlock ? textBlock.text : "";
      }
      default: {
        if (!process.env.OPENAI_API_KEY) {
          throw new Error("No OpenAI API key provided");
        }
        const { OpenAI } = require("openai");
        const openai = new OpenAI();
        const response = await openai.chat.completions.create({
          model: "gpt-4-turbo",
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: prompt,
                },
                {
                  type: "image_url",
                  image_url: {
                    url: imageUrl,
                  },
                },
              ],
            },
          ],
        });
        // `content` may be null; honour the Promise<string> contract.
        return response.choices[0]?.message?.content ?? "";
      }
    }
  } catch (error) {
    Logger.error(`Error generating image alt text: ${error}`);
    return "";
  }
}

View File

@ -1,185 +0,0 @@
import { CheerioAPI } from "cheerio";
import { Logger } from "../../../lib/logger";
/**
 * Page metadata as returned by `extractMetadata`.
 * Every named field is optional; the index signature additionally admits
 * arbitrary extra keys (string, string[] or number values).
 */
interface Metadata {
// Basic document fields
title?: string;
description?: string;
language?: string;
keywords?: string;
robots?: string;
// Open Graph fields
ogTitle?: string;
ogDescription?: string;
ogUrl?: string;
ogImage?: string;
ogAudio?: string;
ogDeterminer?: string;
ogLocale?: string;
ogLocaleAlternate?: string[];
ogSiteName?: string;
ogVideo?: string;
// Dublin Core style fields (dc.* / dcterms.*)
dctermsCreated?: string;
dcDateCreated?: string;
dcDate?: string;
dctermsType?: string;
dcType?: string;
dctermsAudience?: string;
dctermsSubject?: string;
dcSubject?: string;
dcDescription?: string;
dctermsKeywords?: string;
// Article fields
modifiedTime?: string;
publishedTime?: string;
articleTag?: string;
articleSection?: string;
// Scrape bookkeeping
sourceURL?: string;
pageStatusCode?: number;
pageError?: string;
// Any other key (e.g. custom <meta> names)
[key: string]: string | string[] | number | undefined;
}
/**
 * Extracts page metadata from a loaded document.
 *
 * Well-known fields (title, description, Open Graph, Dublin Core, article
 * tags) are read first and included only when non-empty. Afterwards every
 * `<meta>` tag with a `name`/`property` and a `content` is collected under
 * its own key; repeated names are gathered into an array. Collected meta
 * keys are spread last, so they override a well-known field of the same name.
 *
 * Extraction errors are logged and never thrown; whatever was gathered up to
 * that point is returned.
 *
 * @param soup Cheerio handle for the document.
 * @param url Currently unused; kept for interface compatibility.
 * @returns The metadata that could be extracted.
 */
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
  // Reader for the common `<meta ... content="...">` case.
  const metaContent = (selector: string) => (): string | undefined =>
    soup(selector).attr("content") || undefined;

  // Ordered table of well-known fields; the order defines the key order of
  // the returned object.
  const extractors: Array<[string, () => string | string[] | undefined]> = [
    ["title", () => soup("title").text() || undefined],
    ["description", metaContent('meta[name="description"]')],
    ["language", () => soup("html").attr("lang") || undefined],
    ["keywords", metaContent('meta[name="keywords"]')],
    ["robots", metaContent('meta[name="robots"]')],
    ["ogTitle", metaContent('meta[property="og:title"]')],
    ["ogDescription", metaContent('meta[property="og:description"]')],
    ["ogUrl", metaContent('meta[property="og:url"]')],
    ["ogImage", metaContent('meta[property="og:image"]')],
    ["ogAudio", metaContent('meta[property="og:audio"]')],
    ["ogDeterminer", metaContent('meta[property="og:determiner"]')],
    ["ogLocale", metaContent('meta[property="og:locale"]')],
    [
      "ogLocaleAlternate",
      // An array is always truthy, so this key is emitted even when empty
      // (existing output shape, intentionally preserved).
      () =>
        soup('meta[property="og:locale:alternate"]')
          .map((i, el) => soup(el).attr("content"))
          .get(),
    ],
    ["ogSiteName", metaContent('meta[property="og:site_name"]')],
    ["ogVideo", metaContent('meta[property="og:video"]')],
    ["dctermsCreated", metaContent('meta[name="dcterms.created"]')],
    ["dcDateCreated", metaContent('meta[name="dc.date.created"]')],
    ["dcDate", metaContent('meta[name="dc.date"]')],
    ["dctermsType", metaContent('meta[name="dcterms.type"]')],
    ["dcType", metaContent('meta[name="dc.type"]')],
    ["dctermsAudience", metaContent('meta[name="dcterms.audience"]')],
    ["dctermsSubject", metaContent('meta[name="dcterms.subject"]')],
    ["dcSubject", metaContent('meta[name="dc.subject"]')],
    ["dcDescription", metaContent('meta[name="dc.description"]')],
    ["dctermsKeywords", metaContent('meta[name="dcterms.keywords"]')],
    ["modifiedTime", metaContent('meta[property="article:modified_time"]')],
    ["publishedTime", metaContent('meta[property="article:published_time"]')],
    ["articleTag", metaContent('meta[name="article:tag"]')],
    ["articleSection", metaContent('meta[name="article:section"]')],
  ];

  const wellKnown: Metadata = {};
  // Prototype-less map: meta names such as "constructor", "toString" or
  // "__proto__" must be treated as plain keys, not resolved against
  // Object.prototype (which previously produced garbage entries).
  const customMetadata: Record<string, string | string[]> = Object.create(null);

  try {
    for (const [key, read] of extractors) {
      const value = read();
      if (value) {
        wellKnown[key] = value;
      }
    }

    try {
      // Extract all meta tags for custom metadata
      soup("meta").each((i, elem) => {
        try {
          const name = soup(elem).attr("name") || soup(elem).attr("property");
          const content = soup(elem).attr("content");
          if (name && content) {
            const existing = customMetadata[name];
            if (existing === undefined) {
              customMetadata[name] = content;
            } else if (Array.isArray(existing)) {
              existing.push(content);
            } else {
              customMetadata[name] = [existing, content];
            }
          }
        } catch (error) {
          Logger.error(`Error extracting custom metadata (in): ${error}`);
        }
      });
    } catch (error) {
      Logger.error(`Error extracting custom metadata: ${error}`);
    }
  } catch (error) {
    Logger.error(`Error extracting metadata: ${error}`);
  }

  return {
    ...wellKnown,
    ...customMetadata,
  };
}

View File

@ -1,74 +0,0 @@
import cheerio, { CheerioAPI } from "cheerio";
/**
 * One pending substitution in the source HTML string: the character range
 * [start, end) is to be replaced by `markdownTable`.
 */
interface Replacement {
start: number;
end: number;
markdownTable: string;
}
/**
 * Replaces every top-level `<table>` in `html` with a `<div>` containing its
 * markdown rendering (or an empty `<div>` when the table has no content).
 *
 * Tables nested inside another table are not replaced on their own: their
 * source range lies inside the outer table's range, and applying both
 * replacements against offsets taken from the original string corrupts the
 * output. Their rows are already part of the outer table's rendering.
 *
 * @param html Source HTML.
 * @returns The HTML with tables substituted, trimmed.
 */
export const parseTablesToMarkdown = async (html: string): Promise<string> => {
  const soup: CheerioAPI = cheerio.load(html, {
    xmlMode: true,
    withStartIndices: true,
    withEndIndices: true
  });
  const replacements: Replacement[] = [];

  soup("table").each((_, tableElement) => {
    // Skip nested tables (see function comment).
    if (soup(tableElement).parents("table").length > 0) {
      return;
    }
    const { startIndex, endIndex } = tableElement;
    if (startIndex == null || endIndex == null) {
      return; // No source position recorded; cannot splice safely.
    }
    let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement));
    // A table made only of pipes, dashes and whitespace carries no content.
    const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0;
    if (isTableEmpty) {
      markdownTable = '';
    }
    // endIndex points at the last character of the closing tag; +1 makes the range exclusive.
    replacements.push({ start: startIndex, end: endIndex + 1, markdownTable });
  });

  // Apply from the end of the string backwards so earlier offsets stay valid.
  replacements.sort((a, b) => b.start - a.start);
  let modifiedHtml: string = html;
  replacements.forEach(({ start, end, markdownTable }) => {
    modifiedHtml = modifiedHtml.slice(0, start) + `<div>${markdownTable}</div>` + modifiedHtml.slice(end);
  });
  return modifiedHtml.trim();
};
/**
 * Renders all `<tr>` rows found in `tableSoup` as markdown table lines.
 * A divider row is inserted after the first row when that row contains at
 * least one `<th>` cell. Rows without cells are omitted.
 */
export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
  const lines: string[] = [];

  tableSoup("tr").each((rowIndex, rowElement) => {
    const cellNodes = tableSoup(rowElement).find("th, td");
    const renderedCells = cellNodes
      .toArray()
      .map((cell) => ` ${tableSoup(cell).text().trim()} |`)
      .join("");

    if (renderedCells) {
      lines.push(`|${renderedCells}`);
    }

    // Only the very first row can act as the header row.
    const firstRowHasHeaderCell = rowIndex === 0 && cellNodes.is("th");
    if (firstRowHasHeaderCell) {
      lines.push(createMarkdownDividerRow(cellNodes.length));
    }
  });

  return lines.join('\n').trim();
};
/**
 * Renders the `<td>`/`<th>` cells found in `rowSoup` as a single markdown
 * table line. `rowNumber` is accepted but not used.
 */
export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
  const renderedCells = rowSoup("td, th")
    .toArray()
    .map((cell) => ` ${rowSoup(cell).text().trim()} |`);
  return `|${renderedCells.join("")}`;
}
/**
 * Builds the markdown header divider (`| --- | --- |`) for a table with
 * `cellCount` columns.
 */
export function createMarkdownDividerRow(cellCount: number): string {
  const dashes = new Array(cellCount).fill('---');
  return `| ${dashes.join(' | ')} |`;
}

View File

@ -1,140 +0,0 @@
import axios, { AxiosResponse } from "axios";
import fs from "fs/promises";
import { createReadStream, createWriteStream } from "node:fs";
import FormData from "form-data";
import dotenv from "dotenv";
import pdf from "pdf-parse";
import path from "path";
import os from "os";
import { axiosTimeout } from "../../../lib/timeout";
import { Logger } from "../../../lib/logger";
dotenv.config();
/**
 * Downloads the PDF at `url` to a temporary file, converts it to text and
 * removes the temporary file again.
 *
 * Never rejects: a failure is reported as `pageStatusCode: 500` with the
 * error message in `pageError` and empty `content`.
 *
 * @param url Location of the PDF.
 * @param parsePDF Forwarded to `processPdfToText`.
 */
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  let tempFilePath: string | undefined;
  try {
    const download = await downloadPdf(url);
    tempFilePath = download.tempFilePath;
    const content = await processPdfToText(tempFilePath, parsePDF);
    return { content, pageStatusCode: download.pageStatusCode, pageError: download.pageError };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to fetch and process PDF: ${message}`);
    return { content: "", pageStatusCode: 500, pageError: message };
  } finally {
    // Clean up on every path, including failures, and never let a failed
    // cleanup discard content that was parsed successfully.
    if (tempFilePath) {
      const pathToRemove = tempFilePath;
      await fs.unlink(pathToRemove).catch((cleanupError) => {
        Logger.debug(`Failed to remove temporary PDF file ${pathToRemove}: ${cleanupError}`);
      });
    }
  }
}
/**
 * Streams the PDF at `url` into a uniquely named file in the OS temp dir.
 *
 * @param url Location of the PDF.
 * @returns The temp file path plus the HTTP status (and status text when it
 *   is not "OK").
 * @throws Rejects when the request, the response stream or the file write
 *   fails; a partially written file is removed on a best-effort basis.
 */
async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
  const response = await axios({
    url,
    method: "GET",
    responseType: "stream",
  });
  // Date.now() alone collides when two downloads start in the same millisecond.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${uniqueSuffix}.pdf`);
  const writer = createWriteStream(tempFilePath);
  return new Promise((resolve, reject) => {
    const fail = (error: unknown) => {
      writer.destroy();
      // Best-effort cleanup; the caller never learns the path on failure.
      fs.unlink(tempFilePath).catch(() => undefined);
      reject(error);
    };
    writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
    writer.on("error", fail);
    // pipe() does not forward source errors; without this listener a broken
    // download would leave the promise pending forever.
    response.data.on("error", fail);
    response.data.pipe(writer);
  });
}
/**
 * Converts the PDF at `filePath` to text.
 *
 * - `parsePDF` false: the file is read as UTF-8 and returned as is.
 * - `parsePDF` true with `LLAMAPARSE_API_KEY` set: the file is uploaded to
 *   the LlamaIndex parsing API and the result is polled (up to 10 attempts,
 *   0.5 s apart); if no result is obtained, the local parser is used.
 * - `parsePDF` true without the key: the local parser is used.
 *
 * Failures in any branch are logged and yield an empty string.
 *
 * @param filePath Path of the PDF on disk.
 * @param parsePDF Whether to parse the PDF or return the raw file contents.
 */
export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  // Local parsing that reports failure as "" instead of throwing.
  const parseLocally = async (): Promise<string> => {
    try {
      return await processPdf(filePath);
    } catch (error) {
      Logger.error(`Failed to process PDF: ${error}`);
      return "";
    }
  };

  if (!parsePDF) {
    try {
      return await fs.readFile(filePath, "utf-8");
    } catch (error) {
      Logger.error(`Failed to read PDF file: ${error}`);
      return "";
    }
  }

  if (!process.env.LLAMAPARSE_API_KEY) {
    return parseLocally();
  }

  Logger.debug("Processing pdf document w/ LlamaIndex");
  const apiKey = process.env.LLAMAPARSE_API_KEY;
  const headers = {
    Authorization: `Bearer ${apiKey}`,
  };
  const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
  const fileType2 = "application/pdf";
  try {
    const formData = new FormData();
    formData.append("file", createReadStream(filePath), {
      filename: filePath,
      contentType: fileType2,
    });
    const uploadUrl = `${base_url}/upload`;
    const uploadResponse = await axios.post(uploadUrl, formData, {
      headers: {
        ...headers,
        ...formData.getHeaders(),
      },
    });
    const jobId = uploadResponse.data.id;
    const resultType = "text";
    const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;
    const maxAttempts = 10; // Maximum number of attempts
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        const resultResponse: AxiosResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
        if (resultResponse.status === 200) {
          // Only read the payload once a result is actually available;
          // previously this ran unconditionally and clobbered the fallback.
          return resultResponse.data[resultType];
        }
      } catch (error) {
        Logger.debug("Error fetching result w/ LlamaIndex");
      }
      if (attempt < maxAttempts) {
        await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
      }
    }
    Logger.error("Max attempts reached, unable to fetch result.");
    return parseLocally();
  } catch (error) {
    Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
    return parseLocally();
  }
}
/**
 * Reads the PDF at `file` and returns the text extracted by `pdf-parse`.
 * Read and parse errors propagate to the caller unchanged.
 */
async function processPdf(file: string) {
  const pdfBytes = await fs.readFile(file);
  const parsed = await pdf(pdfBytes);
  return parsed.text;
}

View File

@ -1,82 +0,0 @@
import { AnyNode, Cheerio, load } from "cheerio";
import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags";
/**
 * Strips unwanted markup from `html` according to `pageOptions`:
 *
 * 1. `onlyIncludeTags` - keep only clones of the matching elements.
 * 2. Always drop `script`, `style`, `noscript`, `meta` and `head`.
 * 3. `removeTags` - drop matching elements; a tag wrapped in `*...*` is
 *    treated as a case-insensitive regex tested against tag names and
 *    `attr="value"` pairs.
 * 4. `onlyMainContent` - drop everything matched by `excludeNonMainTags`.
 *
 * Note: string-valued `onlyIncludeTags` / `removeTags` are normalised to
 * one-element arrays on `pageOptions` itself.
 *
 * @returns The cleaned HTML.
 */
export const removeUnwantedElements = (
  html: string,
  pageOptions: PageOptions,
) => {
  let $ = load(html);

  if (
    pageOptions.onlyIncludeTags &&
    pageOptions.onlyIncludeTags.length > 0 &&
    pageOptions.onlyIncludeTags[0] !== ""
  ) {
    if (typeof pageOptions.onlyIncludeTags === "string") {
      pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
    }
    if (pageOptions.onlyIncludeTags.length !== 0) {
      // Collect clones of every requested element under a fresh root.
      const keptRoot = load("<div></div>")("div");
      for (const selector of pageOptions.onlyIncludeTags) {
        $(selector).each((_, matched) => {
          keptRoot.append($(matched).clone());
        });
      }
      $ = load(keptRoot.html());
    }
  }

  $("script, style, noscript, meta, head").remove();

  if (
    pageOptions.removeTags &&
    pageOptions.removeTags.length > 0 &&
    pageOptions.removeTags[0] !== ""
  ) {
    if (typeof pageOptions.removeTags === "string") {
      pageOptions.removeTags = [pageOptions.removeTags];
    }
    if (Array.isArray(pageOptions.removeTags)) {
      for (const tag of pageOptions.removeTags) {
        const isWildcard = tag.startsWith("*") && tag.endsWith("*");
        if (!isWildcard) {
          // Plain selector.
          $(tag).remove();
          continue;
        }

        const pattern = new RegExp(tag.slice(1, -1), "i");
        const alsoMatchAsClass = tag.startsWith("*.");
        const doomed: Cheerio<AnyNode> = $("*").filter((_, node) => {
          if (node.type === "tag") {
            const attrs = node.attribs;
            const attrNames = Object.keys(attrs);
            const nameHit = pattern.test(node.name);
            const attrHit = attrNames.some((attr) =>
              pattern.test(`${attr}="${attrs[attr]}"`),
            );
            const classHit = alsoMatchAsClass
              ? attrNames.some((attr) => pattern.test(`class="${attrs[attr]}"`))
              : false;
            return nameHit || attrHit || classHit;
          }
          return false;
        });
        doomed.remove();
      }
    }
  }

  if (pageOptions.onlyMainContent) {
    for (const selector of excludeNonMainTags) {
      $(selector).remove();
    }
  }

  return $.html();
};

View File

@ -1,85 +0,0 @@
import { Logger } from "../../../lib/logger";
import { Document } from "../../../lib/entities";
/**
 * Rewrites relative link targets in each document's `content` to absolute
 * URLs and mirrors the result into `document.markdown`.
 *
 * Handles markdown links (`[text](url)`) and `href="url"` attributes.
 * Markdown images are matched but left untouched here. URLs starting with
 * `data:` or `http` are kept as they are; everything else is resolved
 * against the origin of `document.metadata.sourceURL`.
 * NOTE(review): relative paths resolve against the origin, not the page URL
 * (unlike `replaceImgPathsWithAbsolutePaths`) - kept as is; confirm intent.
 *
 * Documents are modified in place; the same array is returned. Errors are
 * logged at debug level and never thrown.
 */
export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      const baseUrl = new URL(document.metadata.sourceURL).origin;

      const toAbsolute = (url: string): string => {
        if (url.startsWith("data:") || url.startsWith("http")) {
          return url;
        }
        // No leading-slash stripping: "/x" resolves to the same URL either
        // way, while stripping turned protocol-relative "//host/x" into a
        // path on the current origin.
        return new URL(url, baseUrl).toString();
      };

      const paths =
        document.content.match(
          /!?\[.*?\]\(.*?\)|href=".+?"/g
        ) || [];

      paths.forEach((path: string) => {
        try {
          let replacement: string | null = null;

          const hrefMatch = path.match(/^href="(.+?)"$/);
          if (hrefMatch) {
            // Previously every href match threw and was silently dropped.
            replacement = `href="${toAbsolute(hrefMatch[1])}"`;
          } else {
            // Anchored so that text and URL come from the same `](` split as
            // the match itself, even when the link text contains `]` or `(`.
            const markdownMatch = path.match(/^(!?\[.*?\])\((.*?)\)$/);
            if (!markdownMatch) {
              return;
            }
            const [, markdownLinkOrImageText, rawUrl] = markdownMatch;
            // Image is handled afterwards
            if (markdownLinkOrImageText.startsWith("!")) {
              return;
            }
            replacement = `${markdownLinkOrImageText}(${toAbsolute(rawUrl)})`;
          }

          const finalReplacement = replacement;
          // Function form: a plain string would have `$&`, `$$`, ... interpreted.
          document.content = document.content.replace(path, () => finalReplacement);
        } catch (error) {
          Logger.debug(`Skipping path that could not be made absolute (${path}): ${error}`);
        }
      });
      document.markdown = document.content;
    });
    return documents;
  } catch (error) {
    Logger.debug(`Error replacing paths with absolute paths: ${error}`);
    return documents;
  }
};
/**
 * Rewrites relative markdown image URLs (`![alt](url)`) in each document's
 * `content` to absolute URLs and mirrors the result into `document.markdown`.
 *
 * URLs starting with `data:image` or `http` are kept; everything else is
 * resolved against `document.metadata.sourceURL`.
 *
 * Documents are modified in place; the same array is returned. Errors are
 * logged and never thrown.
 */
export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
  try {
    documents.forEach((document) => {
      const images =
        document.content.match(
          /!\[.*?\]\(.*?\)/g
        ) || [];
      images.forEach((image: string) => {
        try {
          // Anchored so alt text and URL come from the same `](` split as
          // the match itself, even when the alt text contains `]` or `(`.
          const parts = image.match(/^!\[(.*?)\]\((.*?)\)$/);
          if (!parts) {
            return;
          }
          const [, altText, rawUrl] = parts;
          let imageUrl = rawUrl;
          if (!imageUrl.startsWith("data:image") && !imageUrl.startsWith("http")) {
            // Resolving against the page URL covers both "/abs/path" and
            // "rel/path", and keeps protocol-relative "//host/x" intact
            // (stripping the leading slash used to break those).
            imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
          }
          const rewritten = `![${altText}](${imageUrl})`;
          // Function form: a plain string would have `$&`, `$$`, ... interpreted.
          document.content = document.content.replace(image, () => rewritten);
        } catch (error) {
          // One unresolvable image must not abort the remaining images/documents.
          Logger.debug(`Skipping image that could not be made absolute (${image}): ${error}`);
        }
      });
      document.markdown = document.content;
    });
    return documents;
  } catch (error) {
    Logger.error(`Error replacing img paths with absolute paths: ${error}`);
    return documents;
  }
};

View File

@ -1,59 +0,0 @@
import axios from "axios";
import * as cheerio from "cheerio";
import { Logger } from "../../../lib/logger";
/**
 * Fetches `urlToScrap` with a plain GET (15 s timeout).
 *
 * @returns The response body, or `null` when the body is empty or the
 *   request fails (both cases are logged at debug level).
 */
export async function attemptScrapWithRequests(
  urlToScrap: string
): Promise<string | null> {
  try {
    const response = await axios.get(urlToScrap, { timeout: 15000 });
    const body = response.data;
    if (body) {
      return body;
    }
    Logger.debug("Failed normal requests as well");
    return null;
  } catch (error) {
    Logger.debug(`Error in attemptScrapWithRequests: ${error}`);
    return null;
  }
}
/**
 * Removes every NUL character (`\u0000`) from `text`.
 *
 * A string pattern passed to `replace` only removes the first occurrence,
 * so a global regex is used to strip all of them.
 */
export function sanitizeText(text: string): string {
  return text.replace(/\u0000/g, "");
}
/**
 * Collects the targets of all `<a href>` elements in `html`.
 *
 * - `http://` / `https://` and `mailto:` links are kept verbatim.
 * - Fragment-only links (`#...`) are ignored.
 * - Everything else is resolved against `baseUrl`.
 *
 * @returns De-duplicated links in order of first appearance.
 */
export function extractLinks(html: string, baseUrl: string): string[] {
  const $ = cheerio.load(html);
  const seen = new Set<string>();

  $('a').each((_, anchor) => {
    const href = $(anchor).attr('href');
    if (!href) {
      return;
    }
    try {
      const isHttp = href.startsWith('http://') || href.startsWith('https://');
      if (isHttp) {
        seen.add(href);
        return;
      }
      if (href.startsWith('#')) {
        return; // Fragment-only links are ignored
      }
      const keepVerbatim = href.startsWith('mailto:');
      seen.add(keepVerbatim ? href : new URL(href, baseUrl).href);
    } catch (error) {
      // Log the error and continue
      console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
    }
  });

  return Array.from(seen);
}

View File

@ -0,0 +1,25 @@
# `scrapeURL`
New URL scraper for Firecrawl
## Signal flow
```mermaid
flowchart TD;
scrapeURL-.->buildFallbackList;
buildFallbackList-.->scrapeURLWithEngine;
scrapeURLWithEngine-.->parseMarkdown;
parseMarkdown-.->wasScrapeSuccessful{{Was scrape successful?}};
wasScrapeSuccessful-."No".->areEnginesLeft{{Are there engines left to try?}};
areEnginesLeft-."Yes, try next engine".->scrapeURLWithEngine;
areEnginesLeft-."No".->NoEnginesLeftError[/NoEnginesLeftError/]
wasScrapeSuccessful-."Yes".->asd;
```
## Differences from `WebScraperDataProvider`
- The job of `WebScraperDataProvider.validateInitialUrl` has been delegated to the zod layer above `scrapeURL`.
- `WebScraperDataProvider.mode` has no equivalent, only `scrape_url` is supported.
- You may no longer specify multiple URLs.
- Built on `v1` definitions, instead of `v0`.
- PDFs are now converted straight to markdown using LlamaParse, instead of converting to just plaintext.
- DOCXs are now converted straight to HTML (and then later to markdown) using mammoth, instead of converting to just plaintext.
- Using the new JSON Schema OpenAI API -- schema failures with LLM Extract will be basically non-existent.

View File

@ -0,0 +1,15 @@
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import mammoth from "mammoth";
/**
 * DOCX engine: downloads the document behind `meta.url` to a temp file and
 * converts it to HTML with mammoth.
 *
 * @returns The final response URL, the HTTP status code and the converted HTML.
 */
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
  const download = await downloadFile(meta.id, meta.url);
  const conversion = await mammoth.convertToHtml({ path: download.tempFilePath });

  return {
    url: download.response.url,
    statusCode: download.response.status,
    html: conversion.value,
  };
}

View File

@ -0,0 +1,28 @@
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
/**
 * Fetch engine: retrieves `meta.url` with the global `fetch`, following
 * redirects and sending `meta.options.headers`.
 *
 * The request must produce response headers within 20 s; otherwise it is
 * aborted and a `TimeoutError` is thrown. The timer is always cleared, so a
 * completed scrape leaves no pending timeout behind.
 *
 * @throws TimeoutError when no response arrives within the timeout.
 * @throws Whatever `fetch` or `specialtyScrapeCheck` throw.
 */
export async function scrapeURLWithFetch(meta: Meta): Promise<EngineScrapeResult> {
  const timeout = 20000;
  const controller = new AbortController();
  let timer: ReturnType<typeof setTimeout> | undefined;

  let response: Response;
  try {
    response = await Promise.race([
      fetch(meta.url, {
        redirect: "follow",
        headers: meta.options.headers,
        signal: controller.signal,
      }),
      new Promise<never>((_, reject) => {
        timer = setTimeout(() => {
          // Reject first so the race settles with TimeoutError, then cancel
          // the in-flight request instead of letting it run on.
          reject(new TimeoutError("Fetch was unable to scrape the page before timing out", { cause: { timeout } }));
          controller.abort();
        }, timeout);
      }),
    ]);
  } finally {
    clearTimeout(timer);
  }

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }), Object.fromEntries(response.headers as any));

  return {
    url: response.url,
    html: await response.text(),
    statusCode: response.status,
    // TODO: error?
  };
}

View File

@ -0,0 +1,107 @@
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { z } from "zod";
import { robustFetch } from "../../lib/fetch";
import { EngineError } from "../../error";
// Shape of a check-status response for a job that completed successfully.
const successSchema = z.object({
jobId: z.string(),
state: z.literal("completed"),
processing: z.literal(false),
// timeTaken: z.number(),
content: z.string(),
url: z.string().optional(),
pageStatusCode: z.number(),
pageError: z.string().optional(),
// TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability
responseHeaders: z.record(z.string(), z.string()).optional(),
// timeTakenCookie: z.number().optional(),
// timeTakenRequest: z.number().optional(),
// legacy: playwright only
screenshot: z.string().optional(),
// new: actions
screenshots: z.string().array().optional(),
actionContent: z.object({
url: z.string(),
html: z.string(),
}).array().optional(),
})
// Parsed form of a successful check-status response.
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
// Shape of a check-status response for a job that has not finished yet.
const processingSchema = z.object({
jobId: z.string(),
state: z.enum(["delayed", "active", "waiting", "waiting-children", "unknown"]),
processing: z.boolean(),
});
// Shape of a check-status response for a job that failed; `error` carries the reason.
const failedSchema = z.object({
jobId: z.string(),
state: z.literal("failed"),
processing: z.literal(false),
error: z.string(),
});
/**
 * Signals that a fire-engine job has not finished yet.
 *
 * The job id is attached as `cause.jobId`.
 */
export class StillProcessingError extends Error {
  constructor(jobId: string) {
    super("Job is still under processing", { cause: { jobId } });
    // Without this the error reports itself as a plain "Error" in logs and stack traces.
    this.name = "StillProcessingError";
  }
}
/**
 * Fetches the status of fire-engine job `jobId` and classifies the response.
 *
 * @param logger Logger used for debug output and handed (as a child) to `robustFetch`.
 * @param jobId Id of the job to look up.
 * @returns The parsed response when it matches the success schema.
 * @throws StillProcessingError when the response matches the processing schema.
 * @throws EngineError when the response matches the failed schema.
 * @throws Error when the response matches none of the schemas.
 */
export async function fireEngineCheckStatus(logger: Logger, jobId: string): Promise<FireEngineCheckStatusSuccess> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  const status = await Sentry.startSpan(
    {
      name: "fire-engine: Check status",
      attributes: { jobId },
    },
    async (span) => {
      // Forward tracing headers only when Sentry is actually running.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      return await robustFetch({
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "GET",
        logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
        headers: tracingHeaders,
      });
    },
  );

  // Schemas are tried in the same precedence as before: success, processing, failed.
  const succeeded = successSchema.safeParse(status);
  if (succeeded.success) {
    logger.debug("Scrape succeeded!", { jobId });
    return succeeded.data;
  }

  if (processingSchema.safeParse(status).success) {
    logger.debug("Scrape is still processing", { jobId });
    throw new StillProcessingError(jobId);
  }

  if (failedSchema.safeParse(status).success) {
    logger.debug("Scrape job failed", { status, jobId });
    throw new EngineError("Scrape job failed", { cause: { status, jobId } });
  }

  logger.debug("Check status returned response not matched by any schema", { status, jobId });
  throw new Error("Check status returned response not matched by any schema", { cause: { status, jobId } });
}

View File

@ -0,0 +1,33 @@
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { robustFetch } from "../../lib/fetch";
/**
 * Sends a DELETE request for fire-engine job `jobId`.
 *
 * The request is issued with `ignoreResponse` and `ignoreFailure` set; we do
 * not care whether it fails or not.
 *
 * @param logger Logger handed (as a child tagged with the job id) to `robustFetch`.
 * @param jobId Id of the job to delete.
 */
export async function fireEngineDelete(logger: Logger, jobId: string) {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  await Sentry.startSpan(
    {
      name: "fire-engine: Delete scrape",
      attributes: { jobId },
    },
    async (span) => {
      // Forward tracing headers only when Sentry is actually running.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      await robustFetch({
        url: `${fireEngineURL}/scrape/${jobId}`,
        method: "DELETE",
        headers: tracingHeaders,
        ignoreResponse: true,
        ignoreFailure: true,
        logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
      });
    },
  );
}

View File

@ -0,0 +1,198 @@
import { Logger } from "winston";
import { Meta } from "../..";
import { fireEngineScrape, FireEngineScrapeRequestChromeCDP, FireEngineScrapeRequestCommon, FireEngineScrapeRequestPlaywright, FireEngineScrapeRequestTLSClient } from "./scrape";
import { EngineScrapeResult } from "..";
import { fireEngineCheckStatus, FireEngineCheckStatusSuccess, StillProcessingError } from "./checkStatus";
import { EngineError, TimeoutError } from "../../error";
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
const defaultTimeout = 20000;

// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the
// `scrapeURLWithFireEngine*` functions.
/**
 * Submits `request` to fire-engine and polls its status until the job finishes.
 *
 * @param logger Logger for progress output; children of it are passed to the fire-engine helpers.
 * @param request Fully built fire-engine scrape request.
 * @param timeout Maximum time in milliseconds to keep polling, measured from after the job was submitted.
 * @returns The successful status payload.
 * @throws TimeoutError when the job does not finish within `timeout`.
 * @throws EngineError when fire-engine reports the job as failed (rethrown from the status check).
 * @throws Error when three unexpected status-check errors have accumulated; they are in `cause.errors`.
 */
async function performFireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient>(
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
  timeout = defaultTimeout,
): Promise<FireEngineCheckStatusSuccess> {
  const scrape = await fireEngineScrape(logger.child({ method: "fireEngineScrape" }), request);

  const startTime = Date.now();
  const errorLimit = 3;
  const pollIntervalMs = 250;
  const errors: unknown[] = [];

  while (true) {
    if (errors.length >= errorLimit) {
      logger.error("Error limit hit.", { errors });
      throw new Error("Error limit hit. See e.cause.errors for errors.", { cause: { errors } });
    }

    if (Date.now() - startTime > timeout) {
      logger.info("Fire-engine was unable to scrape the page before timing out.", { errors, timeout });
      throw new TimeoutError("Fire-engine was unable to scrape the page before timing out", { cause: { errors, timeout } });
    }

    try {
      // Return as soon as the job is done. Previously the loop slept one more
      // poll interval after a successful check before handing the result back.
      return await fireEngineCheckStatus(logger.child({ method: "fireEngineCheckStatus" }), scrape.jobId);
    } catch (error) {
      if (error instanceof StillProcessingError) {
        logger.debug("Scrape is still processing...");
      } else if (error instanceof EngineError) {
        logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId });
        throw error;
      } else {
        // Unexpected failures are reported and counted; polling continues until the limit is hit.
        Sentry.captureException(error);
        errors.push(error);
        logger.debug(`An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`, { error, jobId: scrape.jobId });
      }
    }

    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
}
/**
 * Scrapes `meta.url` through fire-engine's `chrome-cdp` engine.
 *
 * `waitFor` and the screenshot formats are not sent as request fields; they are
 * turned into actions placed ahead of the caller's own actions. When a
 * screenshot format was requested, the first returned screenshot is moved out
 * of `screenshots` into the `screenshot` field of the result.
 *
 * @param meta Scrape context; `url`, `options`, `internalOptions.priority` and `logger` are read.
 * @returns The scrape result, with an `actions` section whenever any action was sent.
 */
export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<EngineScrapeResult> {
  const { formats } = meta.options;
  const wantsFullPage = formats.includes("screenshot@fullPage");
  const wantsScreenshot = formats.includes("screenshot") || wantsFullPage;

  const actions: Action[] = [];
  // Transform waitFor option into an action (unsupported by chrome-cdp)
  if (meta.options.waitFor !== 0) {
    actions.push({ type: "wait" as const, milliseconds: meta.options.waitFor });
  }
  // Transform screenshot format into an action (unsupported by chrome-cdp)
  if (wantsScreenshot) {
    actions.push({ type: "screenshot" as const, fullPage: wantsFullPage });
  }
  // Include specified actions
  actions.push(...(meta.options.actions ?? []));

  const hasActions = actions.length > 0;

  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestChromeCDP = {
    url: meta.url,
    engine: "chrome-cdp",
    instantReturn: true,
    skipTlsVerification: meta.options.skipTlsVerification,
    headers: meta.options.headers,
    // The key is omitted entirely when there is nothing to run.
    ...(hasActions ? { actions } : {}),
    priority: meta.internalOptions.priority,
    geolocation: meta.options.geolocation,
    mobile: meta.options.mobile,
    // TODO: scrollXPaths
  };

  const response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }),
    response.responseHeaders,
  );

  if (wantsScreenshot) {
    meta.logger.debug("Transforming screenshots from actions into screenshot field", { screenshots: response.screenshots });
    // The synthetic screenshot action ran before the caller's actions, so its
    // image is the first one; take it off the list (mutating it in place).
    const shots = response.screenshots ?? [];
    response.screenshot = shots.shift();
    meta.logger.debug("Screenshot transformation done", { screenshots: response.screenshots, screenshot: response.screenshot });
  }

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  const actionResults = hasActions
    ? {
        actions: {
          screenshots: response.screenshots ?? [],
          scrapes: response.actionContent ?? [],
        },
      }
    : {};

  return {
    url: response.url ?? meta.url,
    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
    screenshot: response.screenshot,
    ...actionResults,
  };
}
/**
 * Scrapes `meta.url` through fire-engine's `playwright` engine.
 *
 * `waitFor` and the two screenshot formats are passed as request fields.
 *
 * @param meta Scrape context; `url`, `options`, `internalOptions.priority` and `logger` are read.
 * @returns The scrape result; `screenshot` is set only when fire-engine returned at least one.
 */
export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestPlaywright = {
    url: meta.url,
    engine: "playwright",
    instantReturn: true,
    headers: meta.options.headers,
    priority: meta.internalOptions.priority,
    screenshot: meta.options.formats.includes("screenshot"),
    fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
    wait: meta.options.waitFor,
    geolocation: meta.options.geolocation,
  };

  const response = await performFireEngineScrape(
    // This label used to read "scrapeURLWithFireEngineChromeCDP/..." (copy-paste),
    // which attributed these log lines to the wrong engine.
    meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(
    meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }),
    response.responseHeaders,
  );

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,
    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
    ...(response.screenshots !== undefined && response.screenshots.length > 0 ? ({
      screenshot: response.screenshots[0],
    }) : {}),
  };
}
/**
 * Scrapes `meta.url` through fire-engine's `tlsclient` engine.
 *
 * @param meta Scrape context; `url`, `options`, `internalOptions` (`priority`, `atsv`,
 *   `v0DisableJsDom`) and `logger` are read.
 * @returns The scrape result (no screenshot or actions data).
 */
export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<EngineScrapeResult> {
  const request: FireEngineScrapeRequestCommon & FireEngineScrapeRequestTLSClient = {
    url: meta.url,
    engine: "tlsclient",
    instantReturn: true,
    headers: meta.options.headers,
    priority: meta.internalOptions.priority,
    atsv: meta.internalOptions.atsv,
    geolocation: meta.options.geolocation,
    disableJsDom: meta.internalOptions.v0DisableJsDom,
  };

  const response = await performFireEngineScrape(
    // This label used to read "scrapeURLWithFireEngineChromeCDP/..." (copy-paste),
    // which attributed these log lines to the wrong engine.
    meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/callFireEngine", request }),
    request,
  );

  specialtyScrapeCheck(
    meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }),
    response.responseHeaders,
  );

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", { response, sourceURL: meta.url });
  }

  return {
    url: response.url ?? meta.url,
    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
  };
}

View File

@ -0,0 +1,94 @@
import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { z } from "zod";
import { Action } from "../../../../lib/entities";
import { robustFetch } from "../../lib/fetch";
// Request fields shared by every fire-engine engine. `fireEngineScrape` takes
// this type intersected with exactly one of the engine-specific types below.
export type FireEngineScrapeRequestCommon = {
url: string;
headers?: { [K: string]: string };
blockMedia?: boolean; // default: true
blockAds?: boolean; // default: true
// pageOptions?: any; // unused, .scrollXPaths is considered on FE side
// useProxy?: boolean; // unused, default: true
// customProxy?: string; // unused
// disableSmartWaitCache?: boolean; // unused, default: false
// skipDnsCheck?: boolean; // unused, default: false
priority?: number; // default: 1
// team_id?: string; // unused
logRequest?: boolean; // default: true
instantReturn?: boolean; // default: false
geolocation?: { country?: string; languages?: string[]; };
}
// Extra request fields for the "chrome-cdp" engine.
export type FireEngineScrapeRequestChromeCDP = {
engine: "chrome-cdp";
skipTlsVerification?: boolean;
actions?: Action[];
blockMedia?: true; // cannot be false
mobile?: boolean;
};
// Extra request fields for the "playwright" engine.
export type FireEngineScrapeRequestPlaywright = {
engine: "playwright";
blockAds?: boolean; // default: true
// mutually exclusive, default: false
screenshot?: boolean;
fullPageScreenshot?: boolean;
wait?: number; // default: 0
};
// Extra request fields for the "tlsclient" engine.
export type FireEngineScrapeRequestTLSClient = {
engine: "tlsclient";
atsv?: boolean; // v0 only, default: false
disableJsDom?: boolean; // v0 only, default: false
// blockAds?: boolean; // default: true
};
// Shape of fire-engine's reply to a scrape submission.
const schema = z.object({
  jobId: z.string(),
  processing: z.boolean(),
});

/**
 * Submits a scrape request to fire-engine (`POST {FIRE_ENGINE_BETA_URL}/scrape`).
 *
 * @param logger Logger handed (as a child) to `robustFetch`.
 * @param request Common request fields combined with one engine-specific set.
 * @returns The reply validated against `schema`: the job id and its `processing` flag.
 */
export async function fireEngineScrape<Engine extends FireEngineScrapeRequestChromeCDP | FireEngineScrapeRequestPlaywright | FireEngineScrapeRequestTLSClient> (
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
): Promise<z.infer<typeof schema>> {
  const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;

  // TODO: retries
  return await Sentry.startSpan(
    {
      name: "fire-engine: Scrape",
      attributes: { url: request.url },
    },
    async (span) => {
      // Forward tracing headers only when Sentry is actually running.
      const tracingHeaders = Sentry.isInitialized()
        ? {
            "sentry-trace": Sentry.spanToTraceHeader(span),
            "baggage": Sentry.spanToBaggageHeader(span),
          }
        : {};

      return await robustFetch({
        url: `${fireEngineURL}/scrape`,
        method: "POST",
        headers: tracingHeaders,
        body: request,
        logger: logger.child({ method: "fireEngineScrape/robustFetch" }),
        schema,
        tryCount: 3,
      });
    },
  );
}

View File

@ -0,0 +1,295 @@
import { ScrapeActionContent } from "../../../lib/entities";
import { Meta } from "..";
import { scrapeDOCX } from "./docx";
import { scrapeURLWithFireEngineChromeCDP, scrapeURLWithFireEnginePlaywright, scrapeURLWithFireEngineTLSClient } from "./fire-engine";
import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
// Identifier of every scraping engine known to scrapeURL.
export type Engine =
  | "fire-engine;chrome-cdp"
  | "fire-engine;playwright"
  | "fire-engine;tlsclient"
  | "scrapingbee"
  | "scrapingbeeLoad"
  | "playwright"
  | "fetch"
  | "pdf"
  | "docx";

// An integration is enabled when its environment variable is set to a non-empty string.
const useScrapingBee = Boolean(process.env.SCRAPING_BEE_API_KEY);
const useFireEngine = Boolean(process.env.FIRE_ENGINE_BETA_URL);
const usePlaywright = Boolean(process.env.PLAYWRIGHT_MICROSERVICE_URL);

// Engines available in this process: the optional integrations first, then the
// ones that need no configuration.
export const engines: Engine[] = [];
if (useFireEngine) {
  engines.push("fire-engine;chrome-cdp", "fire-engine;playwright", "fire-engine;tlsclient");
}
if (useScrapingBee) {
  engines.push("scrapingbee", "scrapingbeeLoad");
}
if (usePlaywright) {
  engines.push("playwright");
}
engines.push("fetch", "pdf", "docx");
// Every feature flag a scrape can carry. `buildFallbackList` compares a
// scrape's flags with each engine's `features` table.
// NOTE(review): the flag is spelled "screenshot@fullScreen" while the request
// format used by the engines is "screenshot@fullPage" -- confirm the mapping
// where flags are derived from options.
export const featureFlags = [
"actions",
"waitFor",
"screenshot",
"screenshot@fullScreen",
"pdf",
"docx",
"atsv",
"location",
"mobile",
"skipTlsVerification",
"useFastMode",
] as const;
export type FeatureFlag = typeof featureFlags[number];
// Weight of each flag. `buildFallbackList` sums these to score how well an
// engine covers the requested flags.
export const featureFlagOptions: {
[F in FeatureFlag]: {
priority: number;
}
} = {
"actions": { priority: 20 },
"waitFor": { priority: 1 },
"screenshot": { priority: 10 },
"screenshot@fullScreen": { priority: 10 },
"pdf": { priority: 100 },
"docx": { priority: 100 },
"atsv": { priority: 90 }, // NOTE: should atsv force to tlsclient? adjust priority if not
"useFastMode": { priority: 90 },
"location": { priority: 10 },
"mobile": { priority: 10 },
"skipTlsVerification": { priority: 10 },
} as const;
// Result every engine handler resolves to.
export type EngineScrapeResult = {
// URL the content came from; engines fall back to the requested URL when the
// backend does not report one.
url: string;
html: string;
// Only set by engines that produce markdown themselves (the PDF engine does).
markdown?: string;
statusCode: number;
error?: string;
screenshot?: string;
// Output of actions; currently only filled in by the chrome-cdp engine.
actions?: {
screenshots: string[];
scrapes: ScrapeActionContent[];
};
}
// Maps each engine id to the function that performs the scrape.
// `scrapeURLWithEngine` looks handlers up here.
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
} = {
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
// The two ScrapingBee variants share one handler factory and differ only in
// the `wait_browser` value passed to it.
"scrapingbee": scrapeURLWithScrapingBee("domcontentloaded"),
"scrapingbeeLoad": scrapeURLWithScrapingBee("networkidle2"),
"playwright": scrapeURLWithPlaywright,
"fetch": scrapeURLWithFetch,
"pdf": scrapePDF,
"docx": scrapeDOCX,
};
// Capability table for every engine. `buildFallbackList` reads `features` to
// score an engine against the requested feature flags and `quality` to filter
// and order the candidates.
export const engineOptions: {
[E in Engine]: {
// A list of feature flags the engine supports.
features: { [F in FeatureFlag]: boolean },
// This defines the order of engines in general. The engine with the highest quality will be used the most.
// Negative quality numbers are reserved for specialty engines, e.g. PDF and DOCX
quality: number,
}
} = {
"fire-engine;chrome-cdp": {
features: {
"actions": true,
"waitFor": true, // through actions transform
"screenshot": true, // through actions transform
"screenshot@fullScreen": true, // through actions transform
"pdf": false,
"docx": false,
"atsv": false,
"location": true,
"mobile": true,
"skipTlsVerification": true,
"useFastMode": false,
},
quality: 50,
},
"fire-engine;playwright": {
features: {
"actions": false,
"waitFor": true,
"screenshot": true,
"screenshot@fullScreen": true,
"pdf": false,
"docx": false,
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": false,
},
quality: 40,
},
"scrapingbee": {
features: {
"actions": false,
"waitFor": true,
"screenshot": true,
"screenshot@fullScreen": true,
"pdf": false,
"docx": false,
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": false,
},
quality: 30,
},
"scrapingbeeLoad": {
features: {
"actions": false,
"waitFor": true,
"screenshot": true,
"screenshot@fullScreen": true,
"pdf": false,
"docx": false,
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": false,
},
quality: 29,
},
"playwright": {
features: {
"actions": false,
"waitFor": true,
"screenshot": false,
"screenshot@fullScreen": false,
"pdf": false,
"docx": false,
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": false,
},
quality: 20,
},
"fire-engine;tlsclient": {
features: {
"actions": false,
"waitFor": false,
"screenshot": false,
"screenshot@fullScreen": false,
"pdf": false,
"docx": false,
"atsv": true,
"location": true,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": true,
},
quality: 10,
},
"fetch": {
features: {
"actions": false,
"waitFor": false,
"screenshot": false,
"screenshot@fullScreen": false,
"pdf": false,
"docx": false,
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": true,
},
quality: 5,
},
// Specialty engines: negative quality, so `buildFallbackList` drops them
// whenever an engine with positive quality also qualifies.
"pdf": {
features: {
"actions": false,
"waitFor": false,
"screenshot": false,
"screenshot@fullScreen": false,
"pdf": true,
"docx": false,
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": true,
},
quality: -10,
},
"docx": {
features: {
"actions": false,
"waitFor": false,
"screenshot": false,
"screenshot@fullScreen": false,
"pdf": false,
"docx": true,
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": true,
},
quality: -10,
},
};
/**
 * Chooses and orders the engines to try for a scrape.
 *
 * Each candidate engine is scored by summing the priorities of the requested
 * feature flags it supports. Engines scoring at least half (rounded down) of
 * the total requested priority are kept. If any kept engine has a positive
 * quality, engines with non-positive quality are dropped. The remainder is
 * sorted by score, then by quality, both descending.
 *
 * @param meta Scrape context; `featureFlags`, `internalOptions.forceEngine` and `logger` are read.
 *   When `forceEngine` is set it is the only candidate considered.
 * @returns The ordered engines, each with the set of requested flags it does not support.
 */
export function buildFallbackList(meta: Meta): {
  engine: Engine,
  unsupportedFeatures: Set<FeatureFlag>,
}[] {
  const requestedFlags = [...meta.featureFlags];
  const sumPriorities = (flags: FeatureFlag[]): number =>
    flags.reduce((total, flag) => total + featureFlagOptions[flag].priority, 0);

  const prioritySum = sumPriorities(requestedFlags);
  const priorityThreshold = Math.floor(prioritySum / 2);

  const candidates: Engine[] = meta.internalOptions.forceEngine !== undefined
    ? [meta.internalOptions.forceEngine]
    : engines;

  const qualified: {
    engine: Engine,
    supportScore: number,
    unsupportedFeatures: Set<FeatureFlag>,
  }[] = [];

  for (const engine of candidates) {
    const { features } = engineOptions[engine];
    const supportScore = sumPriorities(requestedFlags.filter((flag) => features[flag] === true));
    const unsupportedFeatures = new Set(requestedFlags.filter((flag) => features[flag] !== true));
    const logContext = { supportScore, prioritySum, priorityThreshold, featureFlags: [...meta.featureFlags], unsupportedFeatures };

    if (supportScore < priorityThreshold) {
      meta.logger.debug(`Engine ${engine} does not meet feature priority threshold`, logContext);
      continue;
    }

    qualified.push({ engine, supportScore, unsupportedFeatures });
    meta.logger.debug(`Engine ${engine} meets feature priority threshold`, logContext);
  }

  // Specialty engines (non-positive quality) are only used when nothing else qualified.
  const isGeneralPurpose = (entry: { engine: Engine }): boolean => engineOptions[entry.engine].quality > 0;
  const selected = qualified.some(isGeneralPurpose) ? qualified.filter(isGeneralPurpose) : qualified;

  selected.sort((left, right) => {
    const byScore = right.supportScore - left.supportScore;
    return byScore !== 0 ? byScore : engineOptions[right.engine].quality - engineOptions[left.engine].quality;
  });

  return selected;
}
/**
 * Runs the handler registered for `engine` in `engineHandlers`.
 *
 * The handler receives a shallow copy of `meta` whose logger is a child tagged
 * with the engine id and the handler's function name.
 *
 * @param meta Scrape context to pass on; not modified.
 * @param engine Engine whose handler should run.
 * @returns Whatever the handler resolves to.
 */
export async function scrapeURLWithEngine(meta: Meta, engine: Engine): Promise<EngineScrapeResult> {
  const fn = engineHandlers[engine];
  // A function's `name` is never null/undefined -- anonymous handlers report an
  // empty string -- so `||` is required here for the fallback to ever apply.
  const logger = meta.logger.child({ method: fn.name || "scrapeURLWithEngine", engine });
  const _meta = {
    ...meta,
    logger,
  };
  return await fn(_meta);
}

View File

@ -0,0 +1,114 @@
import { createReadStream, promises as fs } from "node:fs";
import FormData from "form-data";
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import * as marked from "marked";
import { robustFetch } from "../../lib/fetch";
import { z } from "zod";
import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
// What a PDF processor hands back: HTML always, markdown when it has some.
type PDFProcessorResult = {html: string, markdown?: string};

/**
 * Converts the PDF at `tempFilePath` using the LlamaIndex cloud parsing API.
 *
 * The file is uploaded, then the job's markdown result is requested once; the
 * HTML is rendered from that markdown with `marked`.
 *
 * @param meta Scrape context; only `logger` is used.
 * @param tempFilePath Path of the downloaded PDF on local disk.
 * @returns The markdown returned by the API and the HTML rendered from it.
 */
async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });

  const uploadForm = new FormData();
  const pdfStream = createReadStream(tempFilePath);
  uploadForm.append("file", pdfStream, {
    filename: tempFilePath,
    contentType: "application/pdf", // NOTE: request.headers["Content-Type"]?
  });

  const { id: jobId } = await robustFetch({
    url: "https://api.cloud.llamaindex.ai/api/parsing/upload",
    method: "POST",
    headers: {
      "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
    },
    body: uploadForm,
    logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/upload/robustFetch" }),
    schema: z.object({
      id: z.string(),
    }),
  });

  // TODO: timeout, retries
  const { markdown } = await robustFetch({
    url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
    method: "GET",
    headers: {
      "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
    },
    logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
    schema: z.object({
      markdown: z.string(),
    }),
  });

  const html = await marked.parse(markdown, { async: true });
  return { markdown, html };
}
/**
 * Extracts the text of the PDF at `tempFilePath` locally with `pdf-parse`.
 *
 * @param meta Scrape context; only `logger` is used.
 * @param tempFilePath Path of the downloaded PDF on local disk.
 * @returns The HTML-escaped text, used as both the markdown and the HTML.
 */
async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
  meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });

  const pdfBytes = await fs.readFile(tempFilePath);
  const { text } = await PdfParse(pdfBytes);
  const escaped = escapeHtml(text);

  return { markdown: escaped, html: escaped };
}
/**
 * Engine handler for PDF documents.
 *
 * With `options.parsePDF` off, the file is fetched and returned base64-encoded
 * as both `html` and `markdown`. Otherwise it is downloaded to a temp file and
 * converted: LlamaParse is tried first when `LLAMAPARSE_API_KEY` is set, with
 * the local `pdf-parse` processor as the fallback.
 *
 * @param meta Scrape context; `id`, `url`, `options.parsePDF` and `logger` are read.
 * @returns URL and status of the download response plus the converted content.
 * @throws Whatever the download or the local fallback processor throws.
 */
export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
  if (!meta.options.parsePDF) {
    const file = await fetchFileToBuffer(meta.url);
    const content = file.buffer.toString("base64");
    return {
      url: file.response.url,
      statusCode: file.response.status,
      html: content,
      markdown: content,
    };
  }

  const { response, tempFilePath } = await downloadFile(meta.id, meta.url);

  try {
    let result: PDFProcessorResult | null = null;

    if (process.env.LLAMAPARSE_API_KEY) {
      try {
        result = await scrapePDFWithLlamaParse({
          ...meta,
          logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
        }, tempFilePath);
      } catch (error) {
        meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
        Sentry.captureException(error);
      }
    }

    if (result === null) {
      result = await scrapePDFWithParsePDF({
        ...meta,
        logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithParsePDF" }),
      }, tempFilePath);
    }

    return {
      url: response.url,
      statusCode: response.status,
      html: result.html,
      markdown: result.markdown,
    };
  } finally {
    // Remove the temp file on every path. Previously a failure in the fallback
    // processor skipped the cleanup and left the file on disk. A cleanup
    // failure is logged rather than thrown so it cannot mask the real outcome.
    await fs.unlink(tempFilePath).catch((error) => {
      meta.logger.warn("Failed to remove temporary PDF file", { error, tempFilePath });
    });
  }
}

View File

@ -0,0 +1,42 @@
import { z } from "zod";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { TimeoutError } from "../../error";
import { robustFetch } from "../../lib/fetch";
/**
 * Scrapes `meta.url` through the Playwright microservice at
 * `PLAYWRIGHT_MICROSERVICE_URL`.
 *
 * The whole call is capped at 20 seconds plus `options.waitFor`; the same
 * value is sent to the service as its `timeout`.
 *
 * @param meta Scrape context; `url`, `options.waitFor`, `options.headers` and `logger` are read.
 * @returns The page content, status code and page error reported by the service.
 *   `url` is always the requested URL.
 * @throws TimeoutError when the service does not answer in time.
 */
export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeResult> {
  const timeout = 20000 + meta.options.waitFor;
  let timer: ReturnType<typeof setTimeout> | undefined;

  const timedOut = new Promise<never>((_, reject) => {
    // Use the computed `timeout` (which includes waitFor) rather than a flat
    // 20 seconds, so a long waitFor cannot make the race fire early.
    timer = setTimeout(() => {
      reject(new TimeoutError("Playwright was unable to scrape the page before timing out", { cause: { timeout } }));
    }, timeout);
  });

  const response = await Promise.race([
    // Must NOT be awaited here: awaiting inside the array would finish the
    // request before the race starts, so the timeout could never win.
    robustFetch({
      url: process.env.PLAYWRIGHT_MICROSERVICE_URL!,
      headers: {
        "Content-Type": "application/json",
      },
      // Passed as a plain object, like the other robustFetch callers, instead
      // of a pre-serialized JSON string.
      body: {
        url: meta.url,
        wait_after_load: meta.options.waitFor,
        timeout,
        headers: meta.options.headers,
      },
      method: "POST",
      logger: meta.logger.child({ method: "scrapeURLWithPlaywright/robustFetch" }),
      schema: z.object({
        content: z.string(),
        pageStatusCode: z.number(),
        pageError: z.string().optional(),
      }),
    }),
    timedOut,
  ]).finally(() => {
    // Release the timer so it does not linger after the request settled.
    clearTimeout(timer);
  });

  return {
    url: meta.url, // TODO: improve redirect following
    html: response.content,
    statusCode: response.pageStatusCode,
    error: response.pageError,
  };
}

View File

@ -0,0 +1,66 @@
import { ScrapingBeeClient } from "scrapingbee";
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { AxiosError, type AxiosResponse } from "axios";
import { EngineError } from "../../error";
const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY!);

/**
 * Builds a ScrapingBee engine handler that uses the given `wait_browser` mode.
 *
 * The handler requests a JSON response with transparent status codes. HTTP
 * error responses from axios are treated as ordinary responses; any other
 * request failure is rethrown.
 *
 * @param wait_browser Value forwarded as ScrapingBee's `wait_browser` parameter.
 * @returns A handler that resolves to the scrape result, or throws EngineError
 *   when ScrapingBee reports an error or the body is not a string.
 */
export function scrapeURLWithScrapingBee(wait_browser: "domcontentloaded" | "networkidle2"): ((meta: Meta) => Promise<EngineScrapeResult>) {
  return async (meta: Meta): Promise<EngineScrapeResult> => {
    const { formats, waitFor } = meta.options;

    let response: AxiosResponse<any>;
    try {
      response = await client.get({
        url: meta.url,
        params: {
          timeout: 15000, // TODO: dynamic timeout based on request timeout
          wait_browser: wait_browser,
          wait: Math.min(waitFor, 35000),
          transparent_status_code: true,
          json_response: true,
          screenshot: formats.includes("screenshot"),
          screenshot_full_page: formats.includes("screenshot@fullPage"),
        },
        headers: {
          "ScrapingService-Request": "TRUE", // this is sent to the page, not to ScrapingBee - mogery
        },
      });
    } catch (error) {
      // Only failures that carry an HTTP response can be processed further.
      if (!(error instanceof AxiosError) || error.response === undefined) {
        throw error;
      }
      response = error.response;
    }

    const raw: Buffer = response.data;
    const body = JSON.parse(new TextDecoder().decode(raw));

    // A payload lacking both a date and a content-type header is treated as an engine error.
    const pageHeaders = body.headers ?? {};
    const headerMarker = pageHeaders["Date"] ?? pageHeaders["date"] ?? pageHeaders["Content-Type"] ?? pageHeaders["content-type"];
    const isHiddenEngineError = !headerMarker;

    if (body.errors || body.body?.error || isHiddenEngineError) {
      meta.logger.error("ScrapingBee threw an error", { body: body.body?.error ?? body.errors ?? body.body ?? body });
      throw new EngineError("Engine error #34", { cause: { body, statusCode: response.status } });
    }

    if (typeof body.body !== "string") {
      meta.logger.error("ScrapingBee: Body is not string??", { body });
      throw new EngineError("Engine error #35", { cause: { body, statusCode: response.status } });
    }

    specialtyScrapeCheck(
      meta.logger.child({ method: "scrapeURLWithScrapingBee/specialtyScrapeCheck" }),
      body.headers,
    );

    const screenshotField = body.screenshot
      ? { screenshot: `data:image/png;base64,${body.screenshot}` }
      : {};

    return {
      url: body["resolved-url"] ?? meta.url,
      html: body.body,
      error: response.status >= 300 ? response.statusText : undefined,
      statusCode: response.status,
      ...screenshotField,
    };
  };
}

View File

@ -0,0 +1,45 @@
import path from "path";
import os from "os";
import { createWriteStream, promises as fs } from "node:fs";
import { EngineError } from "../../error";
import { Writable } from "stream";
import { v4 as uuid } from "uuid";
/**
 * Fetches `url` and reads the whole body into memory.
 *
 * @param url Address of the file to fetch.
 * @returns The fetch response together with its body as a Buffer.
 */
export async function fetchFileToBuffer(url: string): Promise<{
  response: Response,
  buffer: Buffer
}> {
  const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
  const bytes = await response.arrayBuffer();
  const buffer = Buffer.from(bytes);
  return { response, buffer };
}
/**
 * Downloads `url` into a uniquely named file in the OS temp directory.
 *
 * The caller owns the returned file and is responsible for deleting it.
 *
 * @param id Identifier embedded in the temp file name (a random UUID is appended).
 * @param url Address of the file to download.
 * @returns The fetch response and the path of the written file.
 * @throws EngineError when the response has no body or the file cannot be written.
 */
export async function downloadFile(id: string, url: string): Promise<{
  response: Response
  tempFilePath: string
}> {
  const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);

  // Fetch before opening the file, so a failed request leaves nothing on disk.
  const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying

  // This should never happen in the current state of JS (2024), but let's check anyways.
  if (response.body === null) {
    throw new EngineError("Response body was null", { cause: { response } });
  }

  const tempFileWrite = createWriteStream(tempFilePath);
  const written = new Promise<void>((resolve, reject) => {
    tempFileWrite.on("finish", () => resolve());
    tempFileWrite.on("error", reject);
  });

  try {
    // Await the pipe as well as the stream events: a failure while reading the
    // response rejects the pipe promise, which previously went unobserved.
    await Promise.all([
      response.body.pipeTo(Writable.toWeb(tempFileWrite)),
      written,
    ]);
  } catch (error) {
    // Best-effort cleanup of the partial file; the write failure is what gets reported.
    tempFileWrite.destroy();
    await fs.unlink(tempFilePath).catch(() => undefined);
    throw new EngineError("Failed to write to temp file", { cause: { error } });
  }

  return {
    response,
    tempFilePath,
  };
}

View File

@ -0,0 +1,14 @@
import { Logger } from "winston";
import { AddFeatureError } from "../../error";
/**
 * Inspects response headers for a content type that needs a specialty engine.
 *
 * The header name is matched case-insensitively. The media type is compared
 * without its parameters, ignoring case and surrounding whitespace.
 *
 * @param logger Receives a warning when no content-type header is present.
 * @param headers Response headers, or `undefined` when the engine provided none.
 * @throws AddFeatureError with `"pdf"` for PDF responses and `"docx"` for
 *   Word (OpenXML) documents.
 */
export function specialtyScrapeCheck(logger: Logger, headers: Record<string, string> | undefined) {
  const contentTypeEntry = Object.entries(headers ?? {}).find(([name]) => name.toLowerCase() === "content-type");
  const contentType = contentTypeEntry?.[1];

  if (contentType === undefined) {
    logger.warn("Failed to check contentType -- was not present in headers", { headers });
    return;
  }

  // Media types are case-insensitive and may carry parameters ("; charset=...").
  const mediaType = (contentType.split(";", 1)[0] ?? "").trim().toLowerCase();

  if (mediaType === "application/pdf") { // .pdf
    throw new AddFeatureError(["pdf"]);
  }
  if (mediaType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") { // .docx
    throw new AddFeatureError(["docx"]);
  }
}

View File

@ -0,0 +1,34 @@
import { EngineResultsTracker } from "."
import { Engine, FeatureFlag } from "./engines"
/**
 * Error raised when a scraping engine reports a failure, e.g. a failed
 * fire-engine job or an error payload from ScrapingBee.
 */
export class EngineError extends Error {
  constructor(message?: string, options?: ErrorOptions) {
    super(message, options);
    // Without this the error reports itself as a plain "Error" in logs and stack traces.
    this.name = "EngineError";
  }
}
/**
 * Error raised when an engine does not produce a result within its time limit.
 */
export class TimeoutError extends Error {
  constructor(message?: string, options?: ErrorOptions) {
    super(message, options);
    // Without this the error reports itself as a plain "Error" in logs and stack traces.
    this.name = "TimeoutError";
  }
}
/**
 * Error carrying the list of engines that were tried and the per-engine
 * results tracker, with the fixed message "All scraping engines failed!".
 */
export class NoEnginesLeftError extends Error {
  public fallbackList: Engine[];
  public results: EngineResultsTracker;

  /**
   * @param fallbackList Engines that were attempted.
   * @param results Tracker holding what each attempt produced.
   */
  constructor(fallbackList: Engine[], results: EngineResultsTracker) {
    super("All scraping engines failed!");
    // Without this the error reports itself as a plain "Error" in logs and stack traces.
    this.name = "NoEnginesLeftError";
    this.fallbackList = fallbackList;
    this.results = results;
  }
}
/**
 * Error carrying feature flags discovered while a scrape was running, e.g.
 * when a response turns out to be a PDF or DOCX document.
 */
export class AddFeatureError extends Error {
  public featureFlags: FeatureFlag[];

  /**
   * @param featureFlags Flags discovered during the scrape; also listed in the message.
   */
  constructor(featureFlags: FeatureFlag[]) {
    super("New feature flags have been discovered: " + featureFlags.join(", "));
    // Without this the error reports itself as a plain "Error" in logs and stack traces.
    this.name = "AddFeatureError";
    this.featureFlags = featureFlags;
  }
}

Some files were not shown because too many files have changed in this diff Show More