From 8d467c8ca764afd287a9c27717e537e850781d74 Mon Sep 17 00:00:00 2001
From: Gergő Móricz
Date: Thu, 7 Nov 2024 20:57:33 +0100
Subject: [PATCH] `WebScraper` refactor into `scrapeURL` (#714)

* feat: use strictNullChecking
* feat: switch logger to Winston
* feat(scrapeURL): first batch
* fix(scrapeURL): error swallow
* fix(scrapeURL): add timeout to EngineResultsTracker
* fix(scrapeURL): report unexpected error to sentry
* chore: remove unused modules
* feat(transfomers/coerce): warn when a format's response is missing
* feat(scrapeURL): feature flag priorities, engine quality sorting, PDF and DOCX support
* (add note)
* feat(scrapeURL): wip readme
* feat(scrapeURL): LLM extract
* feat(scrapeURL): better warnings
* fix(scrapeURL/engines/fire-engine;playwright): fix screenshot
* feat(scrapeURL): add forceEngine internal option
* feat(scrapeURL/engines): scrapingbee
* feat(scrapeURL/transformars): uploadScreenshot
* feat(scrapeURL): more intense tests
* bunch of stuff
* get rid of WebScraper (mostly)
* adapt batch scrape
* add staging deploy workflow
* fix yaml
* fix logger issues
* fix v1 test schema
* feat(scrapeURL/fire-engine/chrome-cdp): remove wait inserts on actions
* scrapeURL: v0 backwards compat
* logger fixes
* feat(scrapeurl): v0 returnOnlyUrls support
* fix(scrapeURL/v0): URL leniency
* fix(batch-scrape): ts non-nullable
* fix(scrapeURL/fire-engine/chromecdp): fix wait action
* fix(logger): remove error debug key
* feat(requests.http): use dotenv expression
* fix(scrapeURL/extractMetadata): extract custom metadata
* fix crawl option conversion
* feat(scrapeURL): Add retry logic to robustFetch
* fix(scrapeURL): crawl stuff
* fix(scrapeURL): LLM extract
* fix(scrapeURL/v0): search fix
* fix(tests/v0): grant larger response size to v0 crawl status
* feat(scrapeURL): basic fetch engine
* feat(scrapeURL): playwright engine
* feat(scrapeURL): add url-specific parameters
* Update readme and examples
* added e2e tests for most parameters. Still a few actions, location and iframes to be done.
* fixed type
* Nick:
* Update scrape.ts
* Update index.ts
* added actions and base64 check
* Nick: skipTls feature flag?
* 403
* todo
* todo
* fixes
* yeet headers from url specific params
* add warning when final engine has feature deficit
* expose engine results tracker for ScrapeEvents implementation
* ingest scrape events
* fixed some tests
* comment
* Update index.test.ts
* fixed rawHtml
* Update index.test.ts
* update comments
* move geolocation to global f-e option, fix removeBase64Images
* Nick:
* trim url-specific params
* Update index.ts

---------

Co-authored-by: Eric Ciarla
Co-authored-by: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com>
Co-authored-by: Nicolas
---
 .github/archive/js-sdk.yml | 2 -
 .github/archive/python-sdk.yml | 2 -
 .github/archive/rust-sdk.yml | 2 -
 .github/workflows/ci.yml | 2 -
 .github/workflows/deploy-image-staging.yml | 32 +
 CONTRIBUTING.md | 1 -
 SELF_HOST.md | 1 -
 apps/api/.env.example | 5 -
 apps/api/jest.setup.js | 2 +-
 apps/api/package.json | 10 +-
 apps/api/pnpm-lock.yaml | 1772 ++---------------
 apps/api/requests.http | 16 +-
 apps/api/sharedLibs/go-html-to-md/.gitignore | 2 +
 .../__tests__/e2e_full_withAuth/index.test.ts | 2 +-
 .../src/__tests__/e2e_noAuth/index.test.ts | 1 -
 .../__tests__/e2e_v1_withAuth/index.test.ts | 26 +-
 .../e2e_v1_withAuth_all_params/index.test.ts | 603 ++++++
 .../src/__tests__/e2e_withAuth/index.test.ts | 3 +-
 apps/api/src/controllers/auth.ts | 50 +-
 .../controllers/v0/admin/acuc-cache-clear.ts | 6 +-
 apps/api/src/controllers/v0/admin/queue.ts | 18 +-
 .../src/controllers/v0/admin/redis-health.ts | 16 +-
 apps/api/src/controllers/v0/crawl-cancel.ts | 14 +-
 apps/api/src/controllers/v0/crawl-status.ts | 20 +-
 apps/api/src/controllers/v0/crawl.ts | 41 +-
 apps/api/src/controllers/v0/crawlPreview.ts | 34 +-
 apps/api/src/controllers/v0/keyAuth.ts | 7 +-
 apps/api/src/controllers/v0/scrape.ts | 62 +-
 apps/api/src/controllers/v0/search.ts | 38 +-
 apps/api/src/controllers/v0/status.ts | 4 +-
 apps/api/src/controllers/v1/batch-scrape.ts | 16 +-
 apps/api/src/controllers/v1/crawl-cancel.ts | 6 +-
 .../api/src/controllers/v1/crawl-status-ws.ts | 32 +-
 apps/api/src/controllers/v1/crawl-status.ts | 17 +-
 apps/api/src/controllers/v1/crawl.ts | 41 +-
 apps/api/src/controllers/v1/map.ts | 25 +-
 apps/api/src/controllers/v1/scrape-status.ts | 2 +-
 apps/api/src/controllers/v1/scrape.ts | 64 +-
 apps/api/src/controllers/v1/types.ts | 161 +-
 apps/api/src/example.ts | 19 -
 apps/api/src/index.ts | 33 +-
 apps/api/src/lib/LLM-extraction/index.ts | 12 +-
 apps/api/src/lib/LLM-extraction/models.ts | 2 +-
 apps/api/src/lib/batch-process.ts | 2 +-
 apps/api/src/lib/crawl-redis.ts | 15 +-
 apps/api/src/lib/entities.ts | 5 +-
 apps/api/src/lib/html-to-markdown.ts | 8 +-
 apps/api/src/lib/job-priority.ts | 10 +-
 apps/api/src/lib/load-testing-example.ts | 42 -
 apps/api/src/lib/logger.ts | 119 +-
 apps/api/src/lib/map-cosine.ts | 4 +-
 apps/api/src/lib/scrape-events.ts | 12 +-
 apps/api/src/lib/supabase-jobs.ts | 6 +-
 apps/api/src/lib/withAuth.ts | 17 +-
 apps/api/src/main/runWebScraper.ts | 164 +-
 apps/api/src/routes/admin.ts | 2 +-
 apps/api/src/routes/v1.ts | 25 +-
 .../WebScraper/__tests__/crawler.test.ts | 160 --
 .../WebScraper/__tests__/single_url.test.ts | 37 -
 apps/api/src/scraper/WebScraper/crawler.ts | 197 +-
 .../WebScraper/custom/handleCustomScraping.ts | 8 +-
 apps/api/src/scraper/WebScraper/global.ts | 1 -
 apps/api/src/scraper/WebScraper/index.ts | 743 -------
 .../src/scraper/WebScraper/scrapers/fetch.ts | 89 -
 .../scraper/WebScraper/scrapers/fireEngine.ts | 230 ---
 .../scraper/WebScraper/scrapers/playwright.ts | 111 --
 .../WebScraper/scrapers/scrapingBee.ts | 92 -
 apps/api/src/scraper/WebScraper/single_url.ts | 506 -----
 apps/api/src/scraper/WebScraper/sitemap.ts | 18 +-
 .../utils/__tests__/docxProcessor.test.ts | 15 -
 .../utils/__tests__/parseTable.test.ts | 128 --
 .../utils/__tests__/pdfProcessor.test.ts | 19 -
 .../__tests__/removeUnwantedElements.test.ts | 192 --
 .../utils/__tests__/replacePaths.test.ts | 127 --
 .../utils/__tests__/socialBlockList.test.ts | 66 -
 .../src/scraper/WebScraper/utils/blocklist.ts | 4 +-
 .../WebScraper/utils/custom/website_params.ts | 198 --
 .../scraper/WebScraper/utils/docxProcessor.ts | 79 -
 .../scraper/WebScraper/utils/excludeTags.ts | 42 -
 .../WebScraper/utils/imageDescription.ts | 89 -
 .../src/scraper/WebScraper/utils/metadata.ts | 185 --
 .../scraper/WebScraper/utils/parseTable.ts | 74 -
 .../scraper/WebScraper/utils/pdfProcessor.ts | 140 --
 .../utils/removeUnwantedElements.ts | 82 -
 .../scraper/WebScraper/utils/replacePaths.ts | 85 -
 .../api/src/scraper/WebScraper/utils/utils.ts | 59 -
 apps/api/src/scraper/scrapeURL/README.md | 25 +
 .../scraper/scrapeURL/engines/docx/index.ts | 15 +
 .../scraper/scrapeURL/engines/fetch/index.ts | 28 +
 .../engines/fire-engine/checkStatus.ts | 107 +
 .../scrapeURL/engines/fire-engine/delete.ts | 33 +
 .../scrapeURL/engines/fire-engine/index.ts | 198 ++
 .../scrapeURL/engines/fire-engine/scrape.ts | 94 +
 .../src/scraper/scrapeURL/engines/index.ts | 295 +++
 .../scraper/scrapeURL/engines/pdf/index.ts | 114 ++
 .../scrapeURL/engines/playwright/index.ts | 42 +
 .../scrapeURL/engines/scrapingbee/index.ts | 66 +
 .../scrapeURL/engines/utils/downloadFile.ts | 45 +
 .../engines/utils/specialtyHandler.ts | 14 +
 apps/api/src/scraper/scrapeURL/error.ts | 34 +
 apps/api/src/scraper/scrapeURL/index.ts | 320 +++
 .../src/scraper/scrapeURL/lib/extractLinks.ts | 35 +
 .../scraper/scrapeURL/lib/extractMetadata.ts | 132 ++
 apps/api/src/scraper/scrapeURL/lib/fetch.ts | 144 ++
 .../scrapeURL/lib/removeUnwantedElements.ts | 111 ++
 .../scrapeURL/lib/urlSpecificParams.ts | 51 +
 .../src/scraper/scrapeURL/scrapeURL.test.ts | 388 ++++
 .../scraper/scrapeURL/transformers/index.ts | 130 ++
 .../scrapeURL/transformers/llmExtract.ts | 131 ++
 .../transformers/removeBase64Images.ts | 11 +
 .../transformers/uploadScreenshot.ts | 26 +
 apps/api/src/search/fireEngine.ts | 4 +-
 apps/api/src/search/googlesearch.ts | 12 +-
 apps/api/src/search/index.ts | 30 +-
 apps/api/src/services/alerts/index.ts | 14 +-
 apps/api/src/services/alerts/slack.ts | 6 +-
 apps/api/src/services/billing/auto_charge.ts | 16 +-
 .../src/services/billing/credit_billing.ts | 55 +-
 .../api/src/services/billing/issue_credits.ts | 6 +-
 apps/api/src/services/billing/stripe.ts | 12 +-
 apps/api/src/services/idempotency/create.ts | 4 +-
 apps/api/src/services/idempotency/validate.ts | 6 +-
 apps/api/src/services/logging/crawl_log.ts | 4 +-
 apps/api/src/services/logging/log_job.ts | 20 +-
 apps/api/src/services/logging/scrape_log.ts | 8 +-
 apps/api/src/services/logtail.ts | 20 -
 .../notification/email_notification.ts | 26 +-
 apps/api/src/services/posthog.ts | 10 +-
 apps/api/src/services/queue-jobs.ts | 4 +-
 apps/api/src/services/queue-service.ts | 6 +-
 apps/api/src/services/queue-worker.ts | 167 +-
 apps/api/src/services/rate-limiter.test.ts | 2 +-
 apps/api/src/services/rate-limiter.ts | 2 +-
 apps/api/src/services/redis.ts | 14 +-
 apps/api/src/services/redlock.ts | 2 +-
 apps/api/src/services/sentry.ts | 4 +-
 apps/api/src/services/supabase.ts | 6 +-
 apps/api/src/services/system-monitor.ts | 8 +-
 apps/api/src/services/webhook.ts | 19 +-
 apps/api/src/types.ts | 50 +-
 apps/api/tsconfig.json | 5 +-
 .../kubernetes/cluster-install/secret.yaml | 2 -
 142 files changed, 4230 insertions(+), 6334 deletions(-)
 create mode 100644 .github/workflows/deploy-image-staging.yml
 create mode 100644 apps/api/sharedLibs/go-html-to-md/.gitignore
 create mode 100644 apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts
 delete mode 100644 apps/api/src/example.ts
 delete mode 100644 apps/api/src/lib/load-testing-example.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/global.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/index.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/scrapers/fetch.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/scrapers/playwright.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/single_url.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/custom/website_params.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/docxProcessor.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/excludeTags.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/imageDescription.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/metadata.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/parseTable.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/replacePaths.ts
 delete mode 100644 apps/api/src/scraper/WebScraper/utils/utils.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/README.md
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/docx/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/fire-engine/scrape.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/playwright/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/error.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/index.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/lib/extractLinks.ts
 create mode 100644 apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
 create mode 100644
apps/api/src/scraper/scrapeURL/lib/fetch.ts create mode 100644 apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts create mode 100644 apps/api/src/scraper/scrapeURL/lib/urlSpecificParams.ts create mode 100644 apps/api/src/scraper/scrapeURL/scrapeURL.test.ts create mode 100644 apps/api/src/scraper/scrapeURL/transformers/index.ts create mode 100644 apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts create mode 100644 apps/api/src/scraper/scrapeURL/transformers/removeBase64Images.ts create mode 100644 apps/api/src/scraper/scrapeURL/transformers/uploadScreenshot.ts delete mode 100644 apps/api/src/services/logtail.ts diff --git a/.github/archive/js-sdk.yml b/.github/archive/js-sdk.yml index c84bb8b1..7ef096d4 100644 --- a/.github/archive/js-sdk.yml +++ b/.github/archive/js-sdk.yml @@ -8,7 +8,6 @@ env: FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} HOST: ${{ secrets.HOST }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} - LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} @@ -21,7 +20,6 @@ env: SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }} - HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 jobs: diff --git a/.github/archive/python-sdk.yml b/.github/archive/python-sdk.yml index 27449888..bdefeab6 100644 --- a/.github/archive/python-sdk.yml +++ b/.github/archive/python-sdk.yml @@ -8,7 +8,6 @@ env: FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} HOST: ${{ secrets.HOST }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} - LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} @@ -21,7 +20,6 @@ env: SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }} - HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 jobs: diff --git a/.github/archive/rust-sdk.yml b/.github/archive/rust-sdk.yml index 62deeaab..792e06c2 100644 --- a/.github/archive/rust-sdk.yml +++ b/.github/archive/rust-sdk.yml @@ -8,7 +8,6 @@ env: FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} HOST: ${{ secrets.HOST }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} - LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} @@ -21,7 +20,6 @@ env: SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} TEST_API_KEY: ${{ secrets.TEST_API_KEY }} - HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a9a74cc..ef7d1cba 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,6 @@ env: FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} HOST: ${{ secrets.HOST }} LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} - LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} @@ -25,7 +24,6 @@ env: SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} SUPABASE_URL: ${{ secrets.SUPABASE_URL }} TEST_API_KEY: ${{ 
secrets.TEST_API_KEY }} - HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }} USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} diff --git a/.github/workflows/deploy-image-staging.yml b/.github/workflows/deploy-image-staging.yml new file mode 100644 index 00000000..e74aba9a --- /dev/null +++ b/.github/workflows/deploy-image-staging.yml @@ -0,0 +1,32 @@ +name: STAGING Deploy Images to GHCR + +env: + DOTNET_VERSION: '6.0.x' + +on: + push: + branches: + - mog/webscraper-refactor + workflow_dispatch: + +jobs: + push-app-image: + runs-on: ubuntu-latest + defaults: + run: + working-directory: './apps/api' + steps: + - name: 'Checkout GitHub Action' + uses: actions/checkout@main + + - name: 'Login to GitHub Container Registry' + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{github.actor}} + password: ${{secrets.GITHUB_TOKEN}} + + - name: 'Build Inventory Image' + run: | + docker build . --tag ghcr.io/mendableai/firecrawl-staging:latest + docker push ghcr.io/mendableai/firecrawl-staging:latest \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 73ccf0e6..b8c1f0a5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,7 +41,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real SCRAPING_BEE_API_KEY= #Set if you'd like to use scraping Be to handle JS blocking OPENAI_API_KEY= # add for LLM dependednt features (image alt generation, etc.) BULL_AUTH_KEY= @ -LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages diff --git a/SELF_HOST.md b/SELF_HOST.md index 78228485..46e08db9 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -62,7 +62,6 @@ TEST_API_KEY= # use if you've set up authentication and want to test with a real SCRAPING_BEE_API_KEY= # use if you'd like to use as a fallback scraper OPENAI_API_KEY= # add for LLM-dependent features (e.g., image alt generation) BULL_AUTH_KEY= @ -LOGTAIL_KEY= # Use if you're configuring basic logging with logtail PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages diff --git a/apps/api/.env.example b/apps/api/.env.example index 6ba49daa..d54c696d 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -33,8 +33,6 @@ SCRAPING_BEE_API_KEY= # add for LLM dependednt features (image alt generation, etc.) 
OPENAI_API_KEY= BULL_AUTH_KEY=@ -# use if you're configuring basic logging with logtail -LOGTAIL_KEY= # set if you have a llamaparse key you'd like to use to parse pdfs LLAMAPARSE_API_KEY= # set if you'd like to send slack server health status messages @@ -54,9 +52,6 @@ STRIPE_PRICE_ID_STANDARD_NEW_YEARLY= STRIPE_PRICE_ID_GROWTH= STRIPE_PRICE_ID_GROWTH_YEARLY= -HYPERDX_API_KEY= -HDX_NODE_BETA_MODE=1 - # set if you'd like to use the fire engine closed beta FIRE_ENGINE_BETA_URL= diff --git a/apps/api/jest.setup.js b/apps/api/jest.setup.js index c158ca42..0b3b09b7 100644 --- a/apps/api/jest.setup.js +++ b/apps/api/jest.setup.js @@ -1 +1 @@ -global.fetch = require('jest-fetch-mock'); +// global.fetch = require('jest-fetch-mock'); diff --git a/apps/api/package.json b/apps/api/package.json index a0f9cf8e..bb4ea268 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -32,9 +32,11 @@ "@tsconfig/recommended": "^1.0.3", "@types/body-parser": "^1.19.2", "@types/cors": "^2.8.13", + "@types/escape-html": "^1.0.4", "@types/express": "^4.17.17", "@types/jest": "^29.5.12", "@types/node": "^20.14.1", + "@types/pdf-parse": "^1.1.4", "body-parser": "^1.20.1", "express": "^4.18.2", "jest": "^29.6.3", @@ -53,9 +55,7 @@ "@bull-board/api": "^5.20.5", "@bull-board/express": "^5.20.5", "@devil7softwares/pos": "^1.0.2", - "@dqbd/tiktoken": "^1.0.13", - "@hyperdx/node-opentelemetry": "^0.8.1", - "@logtail/node": "^0.4.12", + "@dqbd/tiktoken": "^1.0.16", "@nangohq/node": "^0.40.8", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", @@ -78,6 +78,7 @@ "date-fns": "^3.6.0", "dotenv": "^16.3.1", "dotenv-cli": "^7.4.2", + "escape-html": "^1.0.3", "express-rate-limit": "^7.3.1", "express-ws": "^5.0.2", "form-data": "^4.0.0", @@ -92,6 +93,7 @@ "languagedetect": "^2.0.0", "logsnag": "^1.0.0", "luxon": "^3.4.3", + "marked": "^14.1.2", "md5": "^2.3.0", "moment": "^2.29.4", "mongoose": "^8.4.4", @@ -114,6 +116,8 @@ "typesense": "^1.5.4", "unstructured-client": "^0.11.3", "uuid": "^10.0.0", + "winston": "^3.14.2", + "winston-transport": "^4.8.0", "wordpos": "^2.1.0", "ws": "^8.18.0", "xml2js": "^0.6.2", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 095b507c..3350c74e 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -24,14 +24,8 @@ importers: specifier: ^1.0.2 version: 1.0.2 '@dqbd/tiktoken': - specifier: ^1.0.13 - version: 1.0.15 - '@hyperdx/node-opentelemetry': - specifier: ^0.8.1 - version: 0.8.1 - '@logtail/node': - specifier: ^0.4.12 - version: 0.4.21 + specifier: ^1.0.16 + version: 1.0.16 '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 @@ -98,6 +92,9 @@ importers: dotenv-cli: specifier: ^7.4.2 version: 7.4.2 + escape-html: + specifier: ^1.0.3 + version: 1.0.3 express-rate-limit: specifier: ^7.3.1 version: 7.3.1(express@4.19.2) @@ -140,6 +137,9 @@ importers: luxon: specifier: ^3.4.3 version: 3.4.4 + marked: + specifier: ^14.1.2 + version: 14.1.2 md5: specifier: ^2.3.0 version: 2.3.0 @@ -206,6 +206,12 @@ importers: uuid: specifier: ^10.0.0 version: 10.0.0 + winston: + specifier: ^3.14.2 + version: 3.14.2 + winston-transport: + specifier: ^4.8.0 + version: 4.8.0 wordpos: specifier: ^2.1.0 version: 2.1.0 @@ -237,6 +243,9 @@ importers: '@types/cors': specifier: ^2.8.13 version: 2.8.17 + '@types/escape-html': + specifier: ^1.0.4 + version: 1.0.4 '@types/express': specifier: ^4.17.17 version: 4.17.21 @@ -246,6 +255,9 @@ importers: '@types/node': specifier: ^20.14.1 version: 20.14.1 + '@types/pdf-parse': + specifier: ^1.1.4 + version: 1.1.4 body-parser: specifier: 
^1.20.1 version: 1.20.2 @@ -494,42 +506,22 @@ packages: resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} engines: {node: '>=12'} + '@dabh/diagnostics@2.0.3': + resolution: {integrity: sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==} + '@devil7softwares/pos@1.0.2': resolution: {integrity: sha512-49Ke26+++Ix8C5LChi4uga7aWgMuc5zV1NqjGxXxE7DralZwe+hvUuSJmBDWS+HHZaK9rFzLNdufV4HAvvOxPA==} engines: {node: '>=0'} deprecated: This package has been renamed to `fast-tag-pos` - '@dqbd/tiktoken@1.0.15': - resolution: {integrity: sha512-a6I67K1xUkuqcuwulobIJiLikkoE7egMaviI1Jg5bxSn2V7QGqXsGE3jTKr8UIOU/o74mAAd5TkeXFNBtaKF4A==} + '@dqbd/tiktoken@1.0.16': + resolution: {integrity: sha512-4uIrs5qxAwFVFFEP507HZIZhGOsgfaEMEWDXWalr+v+XP+wJwP60EVmkZtQyQe70IsKGVkx5umBxw4NfmU0pPg==} '@flydotio/dockerfile@0.4.11': resolution: {integrity: sha512-L52UAfrOhmAn3T4TxpeRofQOSO+Kctg+uraB4nLzo4mvvh+4Z7HYxSi7Dnq0Kirz+xx6fDIc4OMNT1EdaORecA==} engines: {node: '>=16.0.0'} hasBin: true - '@grpc/grpc-js@1.10.10': - resolution: {integrity: sha512-HPa/K5NX6ahMoeBv15njAc/sfF4/jmiXLar9UlC2UfHFKZzsCVLc3wbe7+7qua7w9VPh2/L6EBxyAV7/E8Wftg==} - engines: {node: '>=12.10.0'} - - '@grpc/proto-loader@0.7.13': - resolution: {integrity: sha512-AiXO/bfe9bmxBjxxtYxFAXGZvMaN5s8kO+jBHAJCON8rJoB5YS/D6X7ZNc6XQkuHNmyl4CYaMI1fJ/Gn27RGGw==} - engines: {node: '>=6'} - hasBin: true - - '@hyperdx/instrumentation-exception@0.1.0': - resolution: {integrity: sha512-Jgk7JY5J07Mq9fgXApGVhSkS4+WdzzRcWLReAZhxgo46KShxE6w614mFqUSnuo+z6ghlehsy4ForViUfxrFyew==} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@hyperdx/instrumentation-sentry-node@0.1.0': - resolution: {integrity: sha512-n8d/K/8M2owL2w4FNfV+lSVW6yoznEj5SdRCysV/ZIfyrZwpijiiSn7gkRcrOfKHmrxrupyp7DVg5L19cGuH6A==} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@hyperdx/node-opentelemetry@0.8.1': - resolution: {integrity: sha512-wNw0yQf54j/9KXVWeEOu8G6C5FT5EFlrz4dcmscTkwCvo6fQOLRZa/NbGcqugt0LSFMc0/6/Q5RDWVqDpEn0LQ==} - hasBin: true - '@ioredis/commands@1.2.0': resolution: {integrity: sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg==} @@ -636,9 +628,6 @@ packages: '@jridgewell/trace-mapping@0.3.9': resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} - '@js-sdsl/ordered-map@4.4.2': - resolution: {integrity: sha512-iUKgm52T8HOE/makSxjqoWhe95ZJA1/G1sYsGev2JDKUSS14KAgg1LHb+Ba+IPow0xflbnSkOsZcO08C7w1gYw==} - '@langchain/core@0.2.12': resolution: {integrity: sha512-zaKvUcWU1Cxcpd/fxklygY6iUrxls10KTRzyHZGBAIKJq1JD/B10vX59YlFgBs7nqqVTEvaChfIE0O0e2qBttA==} engines: {node: '>=18'} @@ -651,28 +640,12 @@ packages: resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} engines: {node: '>=18'} - '@logtail/core@0.4.21': - resolution: {integrity: sha512-QDq194+24bwi4e+a/pxyf4X67NewhTvBmh9iwM2NhbSVSQz4Fo8xQn1Ul8zuUrXETycu/Od2D8wT2tZFNFx/7A==} - - '@logtail/node@0.4.21': - resolution: {integrity: sha512-zpwkhJgcYaM+vsjotHRJthc0ot1vP0CAVy+fwrkL8XjfdC3NHiWb6f0agQpHlqdRX8RTsAbcYpWNXKPpFB5U9Q==} - - '@logtail/tools@0.4.21': - resolution: {integrity: sha512-xIaolScUwJEikllopGphxBX0lVlN/rA8pLAZiNCMNJXpPbwitoFKLW3w4qRuYdKoFCCJZKwOdwEqU2Fv0i9Cuw==} - - '@logtail/types@0.4.20': - resolution: {integrity: sha512-nYsum10eJMTo+ySBlYXvSrvgD1NDCVUeOlxLBbelq3XUmHu9L48VNR3P0BOmhLamYCTEgjatTj0PyPLfjL1W9g==} - 
'@mixmark-io/domino@2.2.0': resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} '@mongodb-js/saslprep@1.1.7': resolution: {integrity: sha512-dCHW/oEX0KJ4NjDULBo3JiOaK5+6axtpBbS+ao2ZInoAL9/YRQLhXzSNAFz7hP4nzLkIqsfYAK/PDE3+XHny0Q==} - '@msgpack/msgpack@2.8.0': - resolution: {integrity: sha512-h9u4u/jiIRKbq25PM+zymTyW6bhTzELvOoUd+AvYriWOAKpLGnIamaET3pnHYoI5iYphAHBI4ayx0MehR+VVPQ==} - engines: {node: '>= 10'} - '@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.3': resolution: {integrity: sha512-QZHtlVgbAdy2zAqNA9Gu1UpIuI8Xvsd1v8ic6B2pZmeFnFcMWiPLfWXh7TVw4eGEZ/C9TH281KwhVoeQUKbyjw==} cpu: [arm64] @@ -710,10 +683,6 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} - '@opentelemetry/api-logs@0.51.1': - resolution: {integrity: sha512-E3skn949Pk1z2XtXu/lxf6QAZpawuTM/IUEXcAzpiUkTd73Hmvw26FiN3cJuTmkpM5hZzHwkomVdtrh/n/zzwA==} - engines: {node: '>=14'} - '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -722,567 +691,148 @@ packages: resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==} engines: {node: '>=8.0.0'} - '@opentelemetry/auto-instrumentations-node@0.46.1': - resolution: {integrity: sha512-s0CwmY9KYtPawOhV5YO2Gf62uVOQRNvT6Or8IZ0S4gr/kPVNhoMehTsQvqBwSWQfoFrkmW3KKOHiKJEp4dVGXg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.4.1 - - '@opentelemetry/context-async-hooks@1.24.1': - resolution: {integrity: sha512-R5r6DO4kgEOVBxFXhXjwospLQkv+sYxwCfjvoZBe7Zm6KKXAV9kDSJhi/D1BweowdZmO+sdbENLs374gER8hpQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.9.0' - '@opentelemetry/context-async-hooks@1.25.1': resolution: {integrity: sha512-UW/ge9zjvAEmRWVapOP0qyCvPulWU6cQxGxDbWEFfGOj1VBBZAuOqTo3X6yWmDTD3Xe15ysCZChHncr2xFMIfQ==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': '>=1.0.0 <1.10.0' - '@opentelemetry/core@1.24.1': - resolution: {integrity: sha512-wMSGfsdmibI88K9wB498zXY04yThPexo8jvwNNlm542HZB7XrrMRBbAyKJqG8qDRJwIBdBrPMi4V9ZPW/sqrcg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.9.0' - '@opentelemetry/core@1.25.1': resolution: {integrity: sha512-GeT/l6rBYWVQ4XArluLVB6WWQ8flHbdb6r2FCHC3smtdOAbrJBIv35tpV/yp9bmYUJf+xmZpu9DRTIeJVhFbEQ==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': '>=1.0.0 <1.10.0' - '@opentelemetry/exporter-logs-otlp-http@0.51.1': - resolution: {integrity: sha512-cd6GZ9IqCrmvOJwi1HjRR7o9ihF7xhZTekgxUsoyTsPF+SjKMsLF9ur6HeBYkYhk+YjZ1ken3XUMH47oUTvu8Q==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/exporter-metrics-otlp-http@0.51.1': - resolution: {integrity: sha512-oFXvif9iksHUxrzG3P8ohMLt7xSrl+oDMqxD/3XXndU761RFAKSbRDpfrQs25U5D+A2aMV3qk+4kfUWdJhZ77g==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/exporter-metrics-otlp-proto@0.51.1': - resolution: {integrity: sha512-jhj8xD6S4cydXGCuf2tp56+4QI0DbDH6g+0MiPPJVdXjxLj+iycQuqB2cwljWpByblFaOjyUsL/VKtm8C7sQ9A==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/exporter-trace-otlp-grpc@0.51.1': - resolution: {integrity: 
sha512-P9+Hkszih95ITvldGZ+kXvj9HpD1QfS+PwooyHK72GYA+Bgm+yUSAsDkUkDms8+s9HW6poxURv3LcjaMuBBpVQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/exporter-trace-otlp-http@0.51.1': - resolution: {integrity: sha512-n+LhLPsX07URh+HhV2SHVSvz1t4G/l/CE5BjpmhAPqeTceFac1VpyQkavWEJbvnK5bUEXijWt4LxAxFpt2fXyw==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/exporter-trace-otlp-proto@0.51.1': - resolution: {integrity: sha512-SE9f0/6V6EeXC9i+WA4WFjS1EYgaBCpAnI5+lxWvZ7iO7EU1IvHvZhP6Kojr0nLldo83gqg6G7OWFqsID3uF+w==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/exporter-zipkin@1.24.1': - resolution: {integrity: sha512-+Rl/VFmu2n6eaRMnVbyfZx1DqR/1KNyWebYuHyQBZaEAVIn/ZLgmofRpXN1X2nhJ4BNaptQUNxAstCYYz6dKoQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/instrumentation-amqplib@0.37.0': - resolution: {integrity: sha512-XjOHeAOreh0XX4jlzTTUWWqu1dIGvMWM8yvd43JJdRMAmTZisezjKsxLjMEMIvF0PzQdoXwh9DiS9nYE4/QmpA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-aws-lambda@0.41.1': - resolution: {integrity: sha512-/BLG+0DQr2tCILFGJKJH2Fg6eyjhqOlVflYpNddUEXnzyQ/PAhTdgirkqbICFgeSW2XYcEY9zXpuRldrVNw9cA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-aws-sdk@0.41.0': - resolution: {integrity: sha512-7+8WMY0LQeqv6KIObXK+Py44qNFLeCU0ZLLxSZtXEbZ2wJlQISP1St65jRto0NV7isnZoyuOxb2+ZpypPPNv7Q==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-bunyan@0.38.0': - resolution: {integrity: sha512-ThNcgTE22W7PKzTzz5qfGxb5Gf7rA3EORousYo2nJWHHcF6gqiMNv2+GXY3MdpjLBr8IgCfhtvbQdD6rlIPUpA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-cassandra-driver@0.38.0': - resolution: {integrity: sha512-ML4Vw0it2uIpETfX6skuSIGLHF9D3TUKOfdfrk9lnrzzWSzg2aS6pl3UeepkQX4wXHdzlxVRB0USrUqsmxMd5Q==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-connect@0.36.1': - resolution: {integrity: sha512-xI5Q/CMmzBmHshPnzzjD19ptFaYO/rQWzokpNio4QixZYWhJsa35QgRvN9FhPkwgtuJIbt/CWWAufJ3egJNHEA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-connect@0.38.0': resolution: {integrity: sha512-2/nRnx3pjYEmdPIaBwtgtSviTKHWnDZN3R+TkRUnhIVrvBKVcq+I5B2rtd6mr6Fe9cHlZ9Ojcuh7pkNh/xdWWg==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-cucumber@0.6.0': - resolution: {integrity: sha512-90eAF2JPSbPAsOuGfYyctYaoYXqy4Clbxt0j/uUgg6dto4oqwUw3AvTyHQEztLGxeXwEzC1EQigDtVPg5ZexYA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/instrumentation-dataloader@0.9.0': - resolution: {integrity: sha512-fiyCOAw+tlbneok1x7P5UseoGW5nS60CWWx7NXzYW+WOexpSmDQQW7olttGa8fqE6/sVCoi1l+QdfVoETZi/NQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-dns@0.36.1': - resolution: {integrity: sha512-NWRbQ7q0E3co/CNTWLZZvUzZoKhB1iTitY282IM8HDTXkA6VRssCfOcvaHw5ezOh23TJbAeYxmmpVj4hFvDPYQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-express@0.39.0': - resolution: {integrity: 
sha512-AG8U7z7D0JcBu/7dDcwb47UMEzj9/FMiJV2iQZqrsZnxR3FjB9J9oIH2iszJYci2eUdp2WbdvtpD9RV/zmME5A==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-express@0.41.1': resolution: {integrity: sha512-uRx0V3LPGzjn2bxAnV8eUsDT82vT7NTwI0ezEuPMBOTOsnPpGhWdhcdNdhH80sM4TrWrOfXm9HGEdfWE3TRIww==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-fastify@0.36.1': - resolution: {integrity: sha512-3Nfm43PI0I+3EX+1YbSy6xbDu276R1Dh1tqAk68yd4yirnIh52Kd5B+nJ8CgHA7o3UKakpBjj6vSzi5vNCzJIA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-fastify@0.38.0': resolution: {integrity: sha512-HBVLpTSYpkQZ87/Df3N0gAw7VzYZV3n28THIBrJWfuqw3Or7UqdhnjeuMIPQ04BKk3aZc0cWn2naSQObbh5vXw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-fs@0.12.0': - resolution: {integrity: sha512-Waf+2hekJRxIwq1PmivxOWLdMOtYbY22hKr34gEtfbv2CArSv8FBJH4BmQxB9o5ZcwkdKu589qs009dbuSfNmQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-fs@0.14.0': resolution: {integrity: sha512-pVc8P5AgliC1DphyyBUgsxXlm2XaPH4BpYvt7rAZDMIqUpRk8gs19SioABtKqqxvFzg5jPtgJfJsdxq0Y+maLw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-generic-pool@0.36.0': - resolution: {integrity: sha512-CExAEqJvK8jYxrhN8cl6EaGg57EGJi+qsSKouLC5lndXi68gZLOKbZIMZg4pF0kNfp/D4BFaGmA6Ap7d5WoPTw==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-graphql@0.40.0': - resolution: {integrity: sha512-LVRdEHWACWOczv2imD+mhUrLMxsEjPPi32vIZJT57zygR5aUiA4em8X3aiGOCycgbMWkIu8xOSGSxdx3JmzN+w==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-graphql@0.42.0': resolution: {integrity: sha512-N8SOwoKL9KQSX7z3gOaw5UaTeVQcfDO1c21csVHnmnmGUoqsXbArK2B8VuwPWcv6/BC/i3io+xTo7QGRZ/z28Q==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-grpc@0.51.1': - resolution: {integrity: sha512-coRTugFL7De/VNH/1NqPlxnfik87jS+jBXsny+Y/lMhXIA3x8t71IyL9ihuewkD+lNtIxIz6Y7Sq6kPuOqz5dQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-hapi@0.38.0': - resolution: {integrity: sha512-ZcOqEuwuutTDYIjhDIStix22ECblG/i9pHje23QGs4Q4YS4RMaZ5hKCoQJxW88Z4K7T53rQkdISmoXFKDV8xMg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-hapi@0.40.0': resolution: {integrity: sha512-8U/w7Ifumtd2bSN1OLaSwAAFhb9FyqWUki3lMMB0ds+1+HdSxYBe9aspEJEgvxAqOkrQnVniAPTEGf1pGM7SOw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-http@0.51.1': - resolution: {integrity: sha512-6b3nZnFFEz/3xZ6w8bVxctPUWIPWiXuPQ725530JgxnN1cvYFd8CJ75PrHZNjynmzSSnqBkN3ef4R9N+RpMh8Q==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-http@0.52.1': resolution: {integrity: sha512-dG/aevWhaP+7OLv4BQQSEKMJv8GyeOp3Wxl31NHqE8xo9/fYMfEljiZphUHIfyg4gnZ9swMyWjfOQs5GUQe54Q==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-ioredis@0.40.0': - resolution: {integrity: sha512-Jv/fH7KhpWe4KBirsiqeUJIYrsdR2iu2l4nWhfOlRvaZ+zYIiLEzTQR6QhBbyRoAbU4OuYJzjWusOmmpGBnwng==} - 
engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-ioredis@0.42.0': resolution: {integrity: sha512-P11H168EKvBB9TUSasNDOGJCSkpT44XgoM6d3gRIWAa9ghLpYhl0uRkS8//MqPzcJVHr3h3RmfXIpiYLjyIZTw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-knex@0.36.1': - resolution: {integrity: sha512-6bEuiI+yMf3D0+ZWZE2AKmXhIhBvZ0brdO/0A8lUqeqeS+sS4fTcjA1F2CclsCNxYWEgcs8o3QyQqPceBeVRlg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-koa@0.40.0': - resolution: {integrity: sha512-dJc3H/bKMcgUYcQpLF+1IbmUKus0e5Fnn/+ru/3voIRHwMADT3rFSUcGLWSczkg68BCgz0vFWGDTvPtcWIFr7A==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-koa@0.42.0': resolution: {integrity: sha512-H1BEmnMhho8o8HuNRq5zEI4+SIHDIglNB7BPKohZyWG4fWNuR7yM4GTlR01Syq21vODAS7z5omblScJD/eZdKw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-lru-memoizer@0.37.0': - resolution: {integrity: sha512-dHLrn55qVWsHJQYdForPWPUWDk2HZ2jjzkT+WoQSqpYT1j4HxfoiLfBTF+I3EbEYFAJnDRmRAUfA6nU5GPdCLQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-memcached@0.36.0': - resolution: {integrity: sha512-5efkT8ZfN8il5z+yfKYFGm2YR3mhlhaJoGfNOAylKE/6tUH3WDTTWaP7nrURtWGc+fuvDktcEch18Se8qsGS7w==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-mongodb@0.43.0': - resolution: {integrity: sha512-bMKej7Y76QVUD3l55Q9YqizXybHUzF3pujsBFjqbZrRn2WYqtsDtTUlbCK7fvXNPwFInqZ2KhnTqd0gwo8MzaQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-mongodb@0.46.0': resolution: {integrity: sha512-VF/MicZ5UOBiXrqBslzwxhN7TVqzu1/LN/QDpkskqM0Zm0aZ4CVRbUygL8d7lrjLn15x5kGIe8VsSphMfPJzlA==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-mongoose@0.38.1': - resolution: {integrity: sha512-zaeiasdnRjXe6VhYCBMdkmAVh1S5MmXC/0spet+yqoaViGnYst/DOxPvhwg3yT4Yag5crZNWsVXnA538UjP6Ow==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-mongoose@0.40.0': resolution: {integrity: sha512-niRi5ZUnkgzRhIGMOozTyoZIvJKNJyhijQI4nF4iFSb+FUx2v5fngfR+8XLmdQAO7xmsD8E5vEGdDVYVtKbZew==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-mysql2@0.38.1': - resolution: {integrity: sha512-qkpHMgWSDTYVB1vlZ9sspf7l2wdS5DDq/rbIepDwX5BA0N0068JTQqh0CgAh34tdFqSCnWXIhcyOXC2TtRb0sg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-mysql2@0.40.0': resolution: {integrity: sha512-0xfS1xcqUmY7WE1uWjlmI67Xg3QsSUlNT+AcXHeA4BDUPwZtWqF4ezIwLgpVZfHOnkAEheqGfNSWd1PIu3Wnfg==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-mysql@0.38.1': - resolution: {integrity: sha512-+iBAawUaTfX/HAlvySwozx0C2B6LBfNPXX1W8Z2On1Uva33AGkw2UjL9XgIg1Pj4eLZ9R4EoJ/aFz+Xj4E/7Fw==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-mysql@0.40.0': resolution: {integrity: sha512-d7ja8yizsOCNMYIJt5PH/fKZXjb/mS48zLROO4BzZTtDfhNCl2UM/9VIomP2qkGIFVouSJrGr/T00EzY7bPtKA==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - 
'@opentelemetry/instrumentation-nestjs-core@0.37.1': - resolution: {integrity: sha512-ebYQjHZEmGHWEALwwDGhSQVLBaurFnuLIkZD5igPXrt7ohfF4lc5/4al1LO+vKc0NHk8SJWStuRueT86ISA8Vg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-nestjs-core@0.39.0': resolution: {integrity: sha512-mewVhEXdikyvIZoMIUry8eb8l3HUjuQjSjVbmLVTt4NQi35tkpnHQrG9bTRBrl3403LoWZ2njMPJyg4l6HfKvA==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-net@0.36.0': - resolution: {integrity: sha512-rZlbSgwAJys8lpug+xIeAdO98ypYMAPVqrHqc4AHuUl5S4MULHEcjGLMZLoE/guEGO4xAQ5XUezpRFGM1SAnsg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-pg@0.41.0': - resolution: {integrity: sha512-BSlhpivzBD77meQNZY9fS4aKgydA8AJBzv2dqvxXFy/Hq64b7HURgw/ztbmwFeYwdF5raZZUifiiNSMLpOJoSA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-pg@0.43.0': resolution: {integrity: sha512-og23KLyoxdnAeFs1UWqzSonuCkePUzCX30keSYigIzJe/6WSYA8rnEI5lobcxPEzg+GcU06J7jzokuEHbjVJNw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-pino@0.39.0': - resolution: {integrity: sha512-uA17F2iP77o3NculB63QD2zv3jkJ093Gfb0GxHLEqTIqpYs1ToJ53ybWwjJwqFByxk7GrliaxaxVtWC23PKzBg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-redis-4@0.39.0': - resolution: {integrity: sha512-Zpfqfi83KeKgVQ0C2083GZPon3ZPYQ5E59v9FAbhubtOoUb9Rh7n111YD8FPW3sgx6JKp1odXmBmfQhWCaTOpQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-redis-4@0.41.0': resolution: {integrity: sha512-H7IfGTqW2reLXqput4yzAe8YpDC0fmVNal95GHMLOrS89W+qWUKIqxolSh63hJyfmwPSFwXASzj7wpSk8Az+Dg==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation-redis@0.39.1': - resolution: {integrity: sha512-HUjTerD84jRJnSyDrRPqn6xQ7K91o9qLflRPZqzRvq0GRj5PMfc6TJ/z3q/ayWy/2Kzffhrp7HCIVp0u0TkgUg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-restify@0.38.0': - resolution: {integrity: sha512-VYK47Z9GBaZX5MQLL7kZDdzQDdyUtHRD4J/GSr6kdwmIpdpUQXLsV3EnboeB8P+BlpucF57FyJKE8yWTOEMfnA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-router@0.37.0': - resolution: {integrity: sha512-+OPcm7C9I5oPqnpStE+1WkdPWjRx0k5XKratxQmIDFZrmhRcqvMte3vrrzE/OBPg9iqh2tKrSe0y7+0sRfTJyQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-runtime-node@0.4.0': - resolution: {integrity: sha512-/NOgUF5gf3T5c3GMyy6fnQxaVzbOf9j2xcetgymIIX2HSN3Gk7o64G7KDvwHwhaa20ZiF0QDLb3m4AT+tn9eRg==} - engines: {node: '>=14.10.0'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-socket.io@0.39.0': - resolution: {integrity: sha512-4J2ehk5mJyDT6j2yJCOuPxAjit5QB1Fwzhx0LID5jjvhI9LxzZIGDNAPTTHyghSiaRDeNMzceXKkkEQJkg2MNw==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-tedious@0.10.1': - resolution: {integrity: sha512-maSXMxgS0szU52khQzAROV4nWr+3M8mZajMQOc3/7tYjo+Q3HlWAowOuagPvp4pwROK4x6oDaFYlY+ZSj1qjYA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - - '@opentelemetry/instrumentation-undici@0.2.0': - 
resolution: {integrity: sha512-RH9WdVRtpnyp8kvya2RYqKsJouPxvHl7jKPsIfrbL8u2QCKloAGi0uEqDHoOS15ZRYPQTDXZ7d8jSpUgSQmvpA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.7.0 - - '@opentelemetry/instrumentation-winston@0.37.0': - resolution: {integrity: sha512-vOx55fxdNjo2XojJf8JN4jP7VVvQCh7UQzzQ2Q2FpGJpt8Z3EErKaY8xOBkOuJH0TtL/Q72rmIn9c+mRG46BxA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation@0.46.0': resolution: {integrity: sha512-a9TijXZZbk0vI5TGLZl+0kxyFfrXHhX6Svtz7Pp2/VBlCSKrazuULEyoJQrOknJyFWNMEmbbJgOciHCCpQcisw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation@0.51.1': - resolution: {integrity: sha512-JIrvhpgqY6437QIqToyozrUG1h5UhwHkaGK/WAX+fkrpyPtc+RO5FkRtUd9BH0MibabHHvqsnBGKfKVijbmp8w==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.3.0 - '@opentelemetry/instrumentation@0.52.1': resolution: {integrity: sha512-uXJbYU/5/MBHjMp1FqrILLRuiJCs3Ofk0MeRDk8g1S1gD47U8X3JnSwcMO1rtRo1x1a7zKaQHaoYu49p/4eSKw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': ^1.3.0 - '@opentelemetry/otlp-exporter-base@0.51.1': - resolution: {integrity: sha512-UYlnOYyDdzo1Gw559EHCzru0RwhvuXCwoH8jGo9J4gO1TE58GjnEmIjomMsKBCym3qWNJfIQXw+9SZCV0DdQNg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/otlp-grpc-exporter-base@0.51.1': - resolution: {integrity: sha512-ZAS+4pq8o7dsugGTwV9s6JMKSxi+guIHdn0acOv0bqj26e9pWDFx5Ky+bI0aY46uR9Y0JyXqY+KAEYM/SO3DFA==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/otlp-proto-exporter-base@0.51.1': - resolution: {integrity: sha512-gxxxwfk0inDMb5DLeuxQ3L8TtptxSiTNHE4nnAJH34IQXAVRhXSXW1rK8PmDKDngRPIZ6J7ncUCjjIn8b+AgqQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/otlp-transformer@0.51.1': - resolution: {integrity: sha512-OppYOXwV9LQqqtYUCywqoOqX/JT9LQ5/FMuPZ//eTkvuHdUC4ZMwz2c6uSoT2R90GWvvGnF1iEqTGyTT3xAt2Q==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.3.0 <1.9.0' - - '@opentelemetry/propagation-utils@0.30.10': - resolution: {integrity: sha512-hhTW8pFp9PSyosYzzuUL9rdm7HF97w3OCyElufFHyUnYnKkCBbu8ne2LyF/KSdI/xZ81ubxWZs78hX4S7pLq5g==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/propagator-aws-xray@1.25.1': - resolution: {integrity: sha512-soZQdO9EAROMwa9bL2C0VLadbrfRjSA9t7g6X8sL0X1B8V59pzOayYMyTW9qTECn9uuJV98A7qOnJm6KH6yk8w==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.10.0' - - '@opentelemetry/propagator-b3@1.24.1': - resolution: {integrity: sha512-nda97ZwhpZKyUJTXqQuKzNhPMUgMLunbbGWn8kroBwegn+nh6OhtyGkrVQsQLNdVKJl0KeB5z0ZgeWszrYhwFw==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.9.0' - - '@opentelemetry/propagator-jaeger@1.24.1': - resolution: {integrity: sha512-7bRBJn3FG1l195A1m+xXRHvgzAOBsfmRi9uZ5Da18oTh7BLmNDiA8+kpk51FpTsU1PCikPVpRDNPhKVB6lyzZg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.9.0' - '@opentelemetry/redis-common@0.36.2': resolution: {integrity: sha512-faYX1N0gpLhej/6nyp6bgRjzAKXn5GOEMYY7YhciSfCoITAktLUtQ36d24QEWNA1/WA1y6qQunCe0OhHRkVl9g==} engines: {node: '>=14'} - '@opentelemetry/resource-detector-alibaba-cloud@0.28.10': - resolution: {integrity: 
sha512-TZv/1Y2QCL6sJ+X9SsPPBXe4786bc/Qsw0hQXFsNTbJzDTGGUmOAlSZ2qPiuqAd4ZheUYfD+QA20IvAjUz9Hhg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/resource-detector-aws@1.5.1': - resolution: {integrity: sha512-+IUh4gAwJf49vOJM6PIjmgOapRH5zr21ZpFnNU0QZmxRi52AXVhZN7A89pKW6GAQheWnVQLD7iUN87ieYt70tw==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/resource-detector-azure@0.2.9': - resolution: {integrity: sha512-16Z6kyrmszoa7J1uj1kbSAgZuk11K07yEDj6fa3I9XBf8Debi8y4K8ex94kpxbCfEraWagXji3bCWvaq3k4dRg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/resource-detector-container@0.3.11': - resolution: {integrity: sha512-22ndMDakxX+nuhAYwqsciexV8/w26JozRUV0FN9kJiqSWtA1b5dCVtlp3J6JivG5t8kDN9UF5efatNnVbqRT9Q==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/resource-detector-gcp@0.29.10': - resolution: {integrity: sha512-rm2HKJ9lsdoVvrbmkr9dkOzg3Uk0FksXNxvNBgrCprM1XhMoJwThI5i0h/5sJypISUAJlEeJS6gn6nROj/NpkQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': ^1.0.0 - - '@opentelemetry/resources@1.24.1': - resolution: {integrity: sha512-cyv0MwAaPF7O86x5hk3NNgenMObeejZFLJJDVuSeSMIsknlsj3oOZzRv3qSzlwYomXsICfBeFFlxwHQte5mGXQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.9.0' - '@opentelemetry/resources@1.25.1': resolution: {integrity: sha512-pkZT+iFYIZsVn6+GzM0kSX+u3MSLCY9md+lIJOoKl/P+gJFfxJte/60Usdp8Ce4rOs8GduUpSPNe1ddGyDT1sQ==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': '>=1.0.0 <1.10.0' - '@opentelemetry/sdk-logs@0.51.1': - resolution: {integrity: sha512-ULQQtl82b673PpZc5/0EtH4V+BrwVOgKJZEB7tYZnGTG3I98tQVk89S9/JSixomDr++F4ih+LSJTCqIKBz+MQQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.4.0 <1.9.0' - '@opentelemetry/api-logs': '>=0.39.1' - - '@opentelemetry/sdk-metrics@1.24.1': - resolution: {integrity: sha512-FrAqCbbGao9iKI+Mgh+OsC9+U2YMoXnlDHe06yH7dvavCKzE3S892dGtX54+WhSFVxHR/TMRVJiK/CV93GR0TQ==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.3.0 <1.9.0' - '@opentelemetry/sdk-metrics@1.25.1': resolution: {integrity: sha512-9Mb7q5ioFL4E4dDrc4wC/A3NTHDat44v4I3p2pLPSxRvqUbDIQyMVr9uK+EU69+HWhlET1VaSrRzwdckWqY15Q==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': '>=1.3.0 <1.10.0' - '@opentelemetry/sdk-node@0.51.1': - resolution: {integrity: sha512-GgmNF9C+6esr8PIJxCqHw84rEOkYm6XdFWZ2+Wyc3qaUt92ACoN7uSw5iKNvaUq62W0xii1wsGxwHzyENtPP8w==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.3.0 <1.9.0' - - '@opentelemetry/sdk-trace-base@1.24.1': - resolution: {integrity: sha512-zz+N423IcySgjihl2NfjBf0qw1RWe11XIAWVrTNOSSI6dtSPJiVom2zipFB2AEEtJWpv0Iz6DY6+TjnyTV5pWg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.9.0' - '@opentelemetry/sdk-trace-base@1.25.1': resolution: {integrity: sha512-C8k4hnEbc5FamuZQ92nTOp8X/diCY56XUTnMiv9UTuJitCzaNNHAVsdm5+HLCdI8SLQsLWIrG38tddMxLVoftw==} engines: {node: '>=14'} peerDependencies: '@opentelemetry/api': '>=1.0.0 <1.10.0' - '@opentelemetry/sdk-trace-node@1.24.1': - resolution: {integrity: sha512-/FZX8uWaGIAwsDhqI8VvQ+qWtfMNlXjaFYGc+vmxgdRFppCSSIRwrPyIhJO1qx61okyYhoyxVEZAfoiNxrfJCg==} - engines: {node: '>=14'} - peerDependencies: - '@opentelemetry/api': '>=1.0.0 <1.9.0' - - '@opentelemetry/semantic-conventions@1.24.1': - resolution: {integrity: 
sha512-VkliWlS4/+GHLLW7J/rVBA00uXus1SWvwFvcUDxDwmFxYfg/2VI6ekwdXS28cjI8Qz2ky2BzG8OUHo+WeYIWqw==} - engines: {node: '>=14'} - '@opentelemetry/semantic-conventions@1.25.1': resolution: {integrity: sha512-ZDjMJJQRlyk8A1KZFCc+bCbsyrn1wTwdNt56F7twdfUfnHUZUq77/WfONCj8p72NZOyP7pNTdUWSTYC3GTbuuQ==} engines: {node: '>=14'} @@ -1306,36 +856,6 @@ packages: '@prisma/instrumentation@5.17.0': resolution: {integrity: sha512-c1Sle4ji8aasMcYfBBHFM56We4ljfenVtRmS8aY06BllS7SoU6SmJBwG7vil+GHiR0Yrh+t9iBwt4AY0Jr4KNQ==} - '@protobufjs/aspromise@1.1.2': - resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} - - '@protobufjs/base64@1.1.2': - resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} - - '@protobufjs/codegen@2.0.4': - resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==} - - '@protobufjs/eventemitter@1.1.0': - resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==} - - '@protobufjs/fetch@1.1.0': - resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==} - - '@protobufjs/float@1.0.2': - resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} - - '@protobufjs/inquire@1.1.0': - resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==} - - '@protobufjs/path@1.1.2': - resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} - - '@protobufjs/pool@1.1.0': - resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} - - '@protobufjs/utf8@1.1.0': - resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==} - '@puppeteer/browsers@2.2.3': resolution: {integrity: sha512-bJ0UBsk0ESOs6RFcLXOt99a3yTDcOKlzfjad+rhFwdaG1Lu/Wzq58GHYCDTlZ9z6mldf4g+NTb+TXEfe0PpnsQ==} engines: {node: '>=18'} @@ -1423,10 +943,6 @@ packages: engines: {node: '>= 10'} hasBin: true - '@sentry/core@8.13.0': - resolution: {integrity: sha512-N9Qg4ZGxZWp8eb2eUUHVVKgjBLtFIjS805nG92s6yJmkvOpKm6mLtcUaT/iDf3Hta6nG+xRkhbE3r+Z4cbXG8w==} - engines: {node: '>=14.18'} - '@sentry/core@8.26.0': resolution: {integrity: sha512-g/tVmTZD4GNbLFf++hKJfBpcCAtduFEMLnbfa9iT/QEZjlmP+EzY+GsH9bafM5VsNe8DiOUp+kJKWtShzlVdBA==} engines: {node: '>=14.18'} @@ -1450,18 +966,10 @@ packages: engines: {node: '>=14.18'} hasBin: true - '@sentry/types@8.13.0': - resolution: {integrity: sha512-r63s/H5gvQnQM9tTGBXz2xErUbxZALh4e2Lg/1aHj4zIvGLBjA2z5qWsh6TEZYbpmgAyGShLDr6+rWeUVf9yBQ==} - engines: {node: '>=14.18'} - '@sentry/types@8.26.0': resolution: {integrity: sha512-zKmh6SWsJh630rpt7a9vP4Cm4m1C2gDTUqUiH565CajCL/4cePpNWYrNwalSqsOSL7B9OrczA1+n6a6XvND+ng==} engines: {node: '>=14.18'} - '@sentry/utils@8.13.0': - resolution: {integrity: sha512-PxV0v9VbGWH9zP37P5w2msLUFDr287nYjoY2XVF+RSolyiTs1CQNI5ZMUO3o4MsSac/dpXxjyrZXQd72t/jRYA==} - engines: {node: '>=14.18'} - '@sentry/utils@8.26.0': resolution: {integrity: sha512-xvlPU9Hd2BlyT+FhWHGNwnxWqdVRk2AHnDtVcW4Ma0Ri5EwS+uy4Jeik5UkSv8C5RVb9VlxFmS8LN3I1MPJsLw==} engines: {node: '>=14.18'} @@ -1613,12 +1121,6 @@ packages: '@tsconfig/recommended@1.0.6': resolution: {integrity: 
sha512-0IKu9GHYF1NGTJiYgfWwqnOQSlnE9V9R7YohHNNf0/fj/SyOZWzdd06JFr0fLpg1Mqw0kGbYg8w5xdkSqLKM9g==} - '@types/accepts@1.3.7': - resolution: {integrity: sha512-Pay9fq2lM2wXPWbteBsRAGiWH2hig4ZE2asK+mm7kUzlxRTfL961rj89I6zV/E3PcIkDqyuBEcMxFT7rccugeQ==} - - '@types/aws-lambda@8.10.122': - resolution: {integrity: sha512-vBkIh9AY22kVOCEKo5CJlyCgmSWvasC+SWUxL/x/vOwRobMpI/HG1xp/Ae3AqmSiZeLUbOhW0FCD3ZjqqUxmXw==} - '@types/babel__core@7.20.5': resolution: {integrity: sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==} @@ -1634,24 +1136,18 @@ packages: '@types/body-parser@1.19.5': resolution: {integrity: sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==} - '@types/bunyan@1.8.9': - resolution: {integrity: sha512-ZqS9JGpBxVOvsawzmVt30sP++gSQMTejCkIAQ3VdadOcRE8izTyW66hufvwLeH+YEGP6Js2AW7Gz+RMyvrEbmw==} - '@types/connect@3.4.36': resolution: {integrity: sha512-P63Zd/JUGq+PdrM1lv0Wv5SBYeA2+CORvbrXbngriYY0jzLUWfQMQQxOhjONEz/wlHOAxOdY7CY65rgQdTjq2w==} '@types/connect@3.4.38': resolution: {integrity: sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug==} - '@types/content-disposition@0.5.8': - resolution: {integrity: sha512-QVSSvno3dE0MgO76pJhmv4Qyi/j0Yk9pBp0Y7TJ2Tlj+KCgJWY6qX7nnxCOLkZ3VYRSIk1WTxCvwUSdx6CCLdg==} - - '@types/cookies@0.9.0': - resolution: {integrity: sha512-40Zk8qR147RABiQ7NQnBzWzDcjKzNrntB5BAmeGCb2p/MIyOE+4BVvc17wumsUqUw00bJYqoXFHYygQnEFh4/Q==} - '@types/cors@2.8.17': resolution: {integrity: sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==} + '@types/escape-html@1.0.4': + resolution: {integrity: sha512-qZ72SFTgUAZ5a7Tj6kf2SHLetiH5S6f8G5frB2SPQ3EyF02kxdyBFf4Tz4banE3xCgGnKgWLt//a6VuYHKYJTg==} + '@types/express-serve-static-core@4.19.3': resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==} @@ -1664,9 +1160,6 @@ packages: '@types/graceful-fs@4.1.9': resolution: {integrity: sha512-olP3sd1qOEe5dXTSaFvQG+02VdRXcdytWLAZsAq1PecU8uqQAhkrnbli7DagjtXKW/Bl7YJbUsa8MPcuc8LHEQ==} - '@types/http-assert@1.5.5': - resolution: {integrity: sha512-4+tE/lwdAahgZT1g30Jkdm9PzFRde0xwxBNUyRsCitRvCQB90iuA2uJYdUnhnANRcqGXaWOGY4FEoxeElNAK2g==} - '@types/http-errors@2.0.4': resolution: {integrity: sha512-D0CFMMtydbJAegzOyHjtiKPLlvnm3iTZyZRSZoLq2mRhDdmLfIWOCYPfQJ4cu2erKghU++QvjcUjp/5h7hESpA==} @@ -1682,21 +1175,6 @@ packages: '@types/jest@29.5.12': resolution: {integrity: sha512-eDC8bTvT/QhYdxJAulQikueigY5AsdBRH2yDKW3yveW7svY3+DzN84/2NUgkw10RTiJbWqZrTtoGVdYlvFJdLw==} - '@types/keygrip@1.0.6': - resolution: {integrity: sha512-lZuNAY9xeJt7Bx4t4dx0rYCDqGPW8RXhQZK1td7d4H6E9zYbLoOtjBvfwdTKpsyxQI/2jv+armjX/RW+ZNpXOQ==} - - '@types/koa-compose@3.2.8': - resolution: {integrity: sha512-4Olc63RY+MKvxMwVknCUDhRQX1pFQoBZ/lXcRLP69PQkEpze/0cr8LNqJQe5NFb/b19DWi2a5bTi2VAlQzhJuA==} - - '@types/koa@2.14.0': - resolution: {integrity: sha512-DTDUyznHGNHAl+wd1n0z1jxNajduyTh8R53xoewuerdBzGo6Ogj6F2299BFtrexJw4NtgjsI5SMPCmV9gZwGXA==} - - '@types/koa__router@12.0.3': - resolution: {integrity: sha512-5YUJVv6NwM1z7m6FuYpKfNLTZ932Z6EF6xy2BbtpJSyn13DKNQEkXVffFVSnJHxvwwWh2SAeumpjAYUELqgjyw==} - - '@types/memcached@2.2.10': - resolution: {integrity: sha512-AM9smvZN55Gzs2wRrqeMHVP7KE8KWgCJO/XL5yCly2xF6EKa4YlbpK+cLSAH4NG/Ah64HrlegmGqW8kYws7Vxg==} - '@types/mime@1.3.5': resolution: {integrity: sha512-/pyBZWSLD2n0dcHE3hq8s8ZvcETHtEuF+3E7XVt0Ig2nvsVQXdghHVcEkIWjy9A0wKfTn97a/PSDYohKIlnP/w==} @@ -1712,6 +1190,9 @@ 
packages: '@types/node@20.14.1': resolution: {integrity: sha512-T2MzSGEu+ysB/FkWfqmhV3PLyQlowdptmmgD20C6QxsS8Fmv5SjpZ1ayXaEC0S21/h5UJ9iA6W/5vSNU5l00OA==} + '@types/pdf-parse@1.1.4': + resolution: {integrity: sha512-+gbBHbNCVGGYw1S9lAIIvrHW47UYOhMIFUsJcMkMrzy1Jf0vulBN3XQIjPgnoOXveMuHnF3b57fXROnY/Or7eg==} + '@types/pg-pool@2.0.4': resolution: {integrity: sha512-qZAvkv1K3QbmHHFYSNRYPkRjOWRLBYrL4B9c+wG0GSVGBw0NtJwPcgx/DSddeDJvRGMHCEQ4VMEVfuJ/0gZ3XQ==} @@ -1739,15 +1220,9 @@ packages: '@types/shimmer@1.0.5': resolution: {integrity: sha512-9Hp0ObzwwO57DpLFF0InUjUm/II8GmKAvzbefxQTihCb7KI6yc9yzf0nLc4mVdby5N4DRCgQM2wCup9KTieeww==} - '@types/stack-trace@0.0.29': - resolution: {integrity: sha512-TgfOX+mGY/NyNxJLIbDWrO9DjGoVSW9+aB8H2yy1fy32jsvxijhmyJI9fDFgvz3YP4lvJaq9DzdR/M1bOgVc9g==} - '@types/stack-utils@2.0.3': resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==} - '@types/tedious@4.0.14': - resolution: {integrity: sha512-KHPsfX/FoVbUGbyYvk1q9MMQHLPeRZhRJZdO45Q4YjvFkv4hMNghCWTvy7rdKessBsmtz4euWCWAB6/tVpI1Iw==} - '@types/triple-beam@1.3.5': resolution: {integrity: sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==} @@ -1974,9 +1449,6 @@ packages: resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==} engines: {node: '>=10.0.0'} - bignumber.js@9.1.2: - resolution: {integrity: sha512-2/mKyZH9K85bzOEfhXDBFZTGd1CTs+5IHpeFQo9luiBG7hghdC851Pj2WAhb6E3R6b9tZj/XKhbg4fum+Kepug==} - bin-links@4.0.4: resolution: {integrity: sha512-cMtq4W5ZsEwcutJrVId+a/tjt8GSbS+h0oNkdl6+6rBuEv8Ot33Bevj5KPm40t309zuhVic8NjpuL42QCiJWWA==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1988,9 +1460,6 @@ packages: binary-search@1.3.6: resolution: {integrity: sha512-nbE1WxOTTrUWIfsfZ4aHGYu5DOuNkbxGokjV6Z2kxfJK3uaAb8zNK1muzOeipoLHZjInT4Br88BHpzevc681xA==} - bl@4.1.0: - resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} - bluebird@3.4.7: resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} @@ -2118,22 +1587,10 @@ packages: cjs-module-lexer@1.3.1: resolution: {integrity: sha512-a3KdPAANPbNE4ZUv9h6LckSl9zLsYOP4MBmhIPkRaeyybt+r4UghLvq+xw/YwUcC1gqylCkL4rdVs3Lwupjm4Q==} - cli-cursor@3.1.0: - resolution: {integrity: sha512-I/zHAwsKf9FqGoXM4WWRACob9+SNukZTd94DWF57E4toouRulbCxcUh6RKUEOQlYTHJnzkPMySvPNaaSLNfLZw==} - engines: {node: '>=8'} - - cli-spinners@2.9.2: - resolution: {integrity: sha512-ywqV+5MmyL4E7ybXgKys4DugZbX0FC6LnwrhjuykIjnK9k8OQacQ7axGKnjDXWNhns0xot3bZI5h55H8yo9cJg==} - engines: {node: '>=6'} - cliui@8.0.1: resolution: {integrity: sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==} engines: {node: '>=12'} - clone@1.0.4: - resolution: {integrity: sha512-JQHZ2QMW6l3aH/j6xCqQThY/9OH4D/9ls34cgkUBiEeocRTU04tHfKPBsUK1PqZCUQM7GiA0IIXJSuXHI64Kbg==} - engines: {node: '>=0.8'} - cluster-key-slot@1.1.2: resolution: {integrity: sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA==} engines: {node: '>=0.10.0'} @@ -2165,6 +1622,15 @@ packages: color-name@1.1.4: resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} + color-string@1.9.1: + resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} + + color@3.2.1: + 
resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} + + colorspace@1.1.4: + resolution: {integrity: sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==} + combined-stream@1.0.8: resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==} engines: {node: '>= 0.8'} @@ -2323,17 +1789,10 @@ packages: resolution: {integrity: sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==} engines: {node: '>=0.10.0'} - defaults@1.0.4: - resolution: {integrity: sha512-eFuaLoy/Rxalv2kr+lqMlUnrDWV+3j4pljOIJgLIhI058IQfWJ7vXhyEIHu+HtC738klGALYxOKDO0bQP3tg8A==} - define-data-property@1.1.4: resolution: {integrity: sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==} engines: {node: '>= 0.4'} - define-lazy-prop@2.0.0: - resolution: {integrity: sha512-Ds09qNh8yw3khSjiJjiUInaGX9xlqZDY7JVryGxdxV7NPeuqQfplOpQ66yJFZut3jLa5zOwkXw1g9EI2uKh4Og==} - engines: {node: '>=8'} - degenerator@5.0.1: resolution: {integrity: sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==} engines: {node: '>= 14'} @@ -2440,6 +1899,9 @@ packages: emoji-regex@9.2.2: resolution: {integrity: sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==} + enabled@2.0.0: + resolution: {integrity: sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==} + encodeurl@1.0.2: resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==} engines: {node: '>= 0.8'} @@ -2542,9 +2004,6 @@ packages: resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==} engines: {node: '>= 0.10.0'} - extend@3.0.2: - resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==} - extract-zip@2.0.1: resolution: {integrity: sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==} engines: {node: '>= 10.17.0'} @@ -2597,6 +2056,9 @@ packages: resolution: {integrity: sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==} hasBin: true + fn.name@1.1.0: + resolution: {integrity: sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==} + follow-redirects@1.15.6: resolution: {integrity: sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==} engines: {node: '>=4.0'} @@ -2651,14 +2113,6 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} - gaxios@6.7.0: - resolution: {integrity: sha512-DSrkyMTfAnAm4ks9Go20QGOcXEyW/NmZhvTYBU2rb4afBB393WIMQPWPEDMl/k8xqiNN9HYq2zao3oWXsdl2Tg==} - engines: {node: '>=14'} - - gcp-metadata@6.1.0: - resolution: {integrity: sha512-Jh/AIwwgaxan+7ZUUmRLCjtchyDiqh4KjBJ5tW3plBZb5iL/BPcso8A5DlzeD9qlw0duCamnNdpFjxwaT0KyKg==} - engines: {node: '>=14'} - generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2831,9 +2285,6 @@ packages: import-in-the-middle@1.7.1: resolution: {integrity: 
sha512-1LrZPDtW+atAxH42S6288qyDFNQ2YCty+2mxEPRtfazH6Z5QwkaBSTS2ods7hnVJioF6rkRfNoA6A/MstpFXLg==} - import-in-the-middle@1.7.4: - resolution: {integrity: sha512-Lk+qzWmiQuRPPulGQeK5qq0v32k2bHnWrRPFgqyvhw7Kkov5L6MOLOIU3pcWeujc9W4q54Cp3Q2WV16eQkc7Bg==} - import-local@3.1.0: resolution: {integrity: sha512-ASB07uLtnDs1o6EHjKpX34BKYDSqnFerfTOJL2HvMqF70LnxpjkzDB8J44oT9pu4AMPkQwf8jl6szgvNd2tRIg==} engines: {node: '>=8'} @@ -2871,6 +2322,9 @@ packages: is-arrayish@0.2.1: resolution: {integrity: sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==} + is-arrayish@0.3.2: + resolution: {integrity: sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==} + is-binary-path@2.1.0: resolution: {integrity: sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==} engines: {node: '>=8'} @@ -2881,11 +2335,6 @@ packages: is-core-module@2.13.1: resolution: {integrity: sha512-hHrIjvZsftOsvKSn2TRYl63zvxsgE0K+0mYMoH6gD4omR5IWB2KynivBQczo3+wF1cCkjzvptnI9Q0sPU66ilw==} - is-docker@2.2.1: - resolution: {integrity: sha512-F+i2BKsFrH66iaUFc0woD8sLy8getkwTwtOBjvs56Cx4CgJDeKQeqfz8wAYiSb8JOprWhHH5p77PbmYCvvUuXQ==} - engines: {node: '>=8'} - hasBin: true - is-extglob@2.1.1: resolution: {integrity: sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==} engines: {node: '>=0.10.0'} @@ -2902,10 +2351,6 @@ packages: resolution: {integrity: sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==} engines: {node: '>=0.10.0'} - is-interactive@1.0.0: - resolution: {integrity: sha512-2HvIEKRoqS62guEC+qBjpvRubdX910WCMuJTZ+I9yvqKU2/12eSL549HMwtabb4oupdj2sMP50k+XJfB/8JE6w==} - engines: {node: '>=8'} - is-number@7.0.0: resolution: {integrity: sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==} engines: {node: '>=0.12.0'} @@ -2922,14 +2367,6 @@ packages: resolution: {integrity: sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==} engines: {node: '>=8'} - is-unicode-supported@0.1.0: - resolution: {integrity: sha512-knxG2q4UC3u8stRGyAVJCOdxFmv5DZiRcdlIaAQXAbSfJya+OhopNotLQrstBhququ4ZpuKbDc/8S6mgXgPFPw==} - engines: {node: '>=10'} - - is-wsl@2.2.0: - resolution: {integrity: sha512-fKzAra0rGJUUBwGBgNkHZuToZcn+TtXHpeCgmkMJMMYx1sQDYaCSyjJBSCa2nH1DGm7s3n1oBnohoVTBaN7Lww==} - engines: {node: '>=8'} - isarray@1.0.0: resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==} @@ -3138,9 +2575,6 @@ packages: engines: {node: '>=4'} hasBin: true - json-bigint@1.0.0: - resolution: {integrity: sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==} - json-parse-even-better-errors@2.3.1: resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} @@ -3151,9 +2585,6 @@ packages: json-schema-traverse@1.0.0: resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} - json-stringify-safe@5.0.1: - resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==} - json5@2.2.3: resolution: {integrity: sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==} engines: {node: '>=6'} @@ -3184,6 +2615,9 @@ packages: koffi@2.9.0: resolution: {integrity: 
sha512-KCsuJ2gM58n6bNdR2Z7gqsh/3TchxxQFbVgax2/UvAjRTgwNSYAJDx9E3jrkBP4jEDHWRCfE47Y2OG+/fiSvEw==} + kuler@2.0.0: + resolution: {integrity: sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==} + langchain@0.2.8: resolution: {integrity: sha512-kb2IOMA71xH8e6EXFg0l4S+QSMC/c796pj1+7mPBkR91HHwoyHZhFRrBaZv4tV+Td+Ba91J2uEDBmySklZLpNQ==} engines: {node: '>=18'} @@ -3378,24 +2812,12 @@ packages: resolution: {integrity: sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==} engines: {node: '>=8'} - lodash.camelcase@4.3.0: - resolution: {integrity: sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==} - lodash.defaults@4.2.0: resolution: {integrity: sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ==} lodash.isarguments@3.1.0: resolution: {integrity: sha512-chi4NHZlZqZD18a0imDHnZPrDeBbTtVN7GXMwuGdRH9qotxAjYs3aVLKc7zNOG9eddR5Ksd8rvFEBc9SsggPpg==} - lodash.isobject@3.0.2: - resolution: {integrity: sha512-3/Qptq2vr7WeJbB4KHUSKlq8Pl7ASXi3UG6CMbBm8WRtXi8+GHm7mKaU3urfpSEzWe2wCIChs6/sdocUsTKJiA==} - - lodash.isplainobject@4.0.6: - resolution: {integrity: sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==} - - lodash.isstring@4.0.1: - resolution: {integrity: sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==} - lodash.memoize@4.1.2: resolution: {integrity: sha512-t7j+NzmgnQzTAYXcsHYLgimltOV1MXHtlOWf6GjL9Kj8GK5FInw5JotxvbOs+IvV1/Dzo04/fCGfLVs7aXb4Ag==} @@ -3408,14 +2830,14 @@ packages: lodash@4.17.21: resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} - log-symbols@4.1.0: - resolution: {integrity: sha512-8XPvpAA8uyhfteu8pIvQxpJZ7SYYdpUivZpGy6sFsBuKRY/7rQGavedeB8aK+Zkyq6upMFVL/9AW6vOYzfRyLg==} - engines: {node: '>=10'} - logform@2.6.0: resolution: {integrity: sha512-1ulHeNPp6k/LD8H91o7VYFBng5i1BDE7HoKxVbZiGFidS1Rj65qcywLxX+pVfAPoQJEjRdvKcusKwOupHCVOVQ==} engines: {node: '>= 12.0.0'} + logform@2.6.1: + resolution: {integrity: sha512-CdaO738xRapbKIMVn2m4F6KTj4j7ooJ8POVnebSgKo3KBz5axNXRAL7ZdRjIV6NOr2Uf4vjtRkxrFETOioCqSA==} + engines: {node: '>= 12.0.0'} + loglevel@1.9.1: resolution: {integrity: sha512-hP3I3kCrDIMuRwAwHltphhDM1r8i55H33GgqjXbrisuJhF4kRhW1dNuxsRklp4bXl8DSdLaNLuiL4A/LWRfxvg==} engines: {node: '>= 0.6.0'} @@ -3423,9 +2845,6 @@ packages: logsnag@1.0.0: resolution: {integrity: sha512-HMzjh75OR5EVY7Be4Rw8TcDTIY5UPsrXF1HvQ6EzDi21x5cQcDzi4Ts0Y/ruPCbxKY2KG17YjeeTzErXFewFBg==} - long@5.2.3: - resolution: {integrity: sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==} - loose-envify@1.4.0: resolution: {integrity: sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==} hasBin: true @@ -3467,6 +2886,11 @@ packages: engines: {node: '>=12.0.0'} hasBin: true + marked@14.1.2: + resolution: {integrity: sha512-f3r0yqpz31VXiDB/wj9GaOB0a2PRLQl6vJmXiFrniNwjkKdvakqJRULhjFKJpxOchlCRiG5fcacoUZY5Xa6PEQ==} + engines: {node: '>= 18'} + hasBin: true + md5@2.3.0: resolution: {integrity: sha512-T1GITYmFaKuO91vxyoQMFETst+O71VUPEU3ze5GNzDm0OWdP8v1ziTaAEPUr/3kLsY3Sftgz242A1SetQiDL7g==} @@ -3736,14 +3160,13 @@ packages: once@1.4.0: resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==} + one-time@1.0.0: + resolution: {integrity: 
sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==} + onetime@5.1.2: resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==} engines: {node: '>=6'} - open@8.4.2: - resolution: {integrity: sha512-7x81NCL719oNbsq/3mh+hVrAWmFuEYUqrq/Iw3kUzH8ReypT9QQ0BLoJS7/G9k6N81XjW4qHWtjWwe/9eLy1EQ==} - engines: {node: '>=12'} - openai@3.3.0: resolution: {integrity: sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ==} @@ -3772,10 +3195,6 @@ packages: option@0.2.4: resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==} - ora@5.4.1: - resolution: {integrity: sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ==} - engines: {node: '>=10'} - p-finally@1.0.0: resolution: {integrity: sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==} engines: {node: '>=4'} @@ -3919,9 +3338,6 @@ packages: resolution: {integrity: sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==} engines: {node: '>=8.6'} - pino-abstract-transport@1.2.0: - resolution: {integrity: sha512-Guhh8EZfPCfH+PMXAb6rKOjGQEoy0xlAIn+irODG5kgfYV+BQ0rGYYWTIel3P5mmyXqkYkPmdIkywsn6QKUR1Q==} - pirates@4.0.6: resolution: {integrity: sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==} engines: {node: '>= 6'} @@ -3986,10 +3402,6 @@ packages: proto-list@1.2.4: resolution: {integrity: sha512-vtK/94akxsTMhe0/cbfpR+syPuszcuwhqVjJq26CuNDgFGj682oRBXOP5MJpv2r7JtE8MsiepGIqvvOTBwn2vA==} - protobufjs@7.3.2: - resolution: {integrity: sha512-RXyHaACeqXeqAKGLDl68rQKbmObRsTIn4TYVUUug1KfS47YWCo5MacGITEryugIgZqORCvJWEk4l449POg5Txg==} - engines: {node: '>=12.0.0'} - proxy-addr@2.0.7: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} @@ -4139,10 +3551,6 @@ packages: resolution: {integrity: sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==} hasBin: true - restore-cursor@3.1.0: - resolution: {integrity: sha512-l+sSefzHpj5qimhFSE5a8nufZYAM3sBSVMAPtYkmC+4EH2anSGaEMXSD0izRQbu9nfyQ9y5JrVmp7E8oZrUjvA==} - engines: {node: '>=8'} - retry@0.13.1: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} @@ -4213,10 +3621,6 @@ packages: resolution: {integrity: sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==} engines: {node: '>= 0.8.0'} - serialize-error@8.1.0: - resolution: {integrity: sha512-3NnuWfM6vBYoy5gZFvHiYsVbafvI9vZv/+jlIigFn4oP4zjNPK3LhcY0xSCgeb1a5L8jO71Mit9LlNoi2UfDDQ==} - engines: {node: '>=10'} - serve-static@1.15.0: resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==} engines: {node: '>= 0.8.0'} @@ -4259,6 +3663,9 @@ packages: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} + simple-swizzle@0.2.2: + resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} + simple-update-notifier@1.1.0: resolution: {integrity: 
sha512-VpsrsJSUcJEseSbMHkrsrAVSdvVS5I96Qo1QAQ4FxQ9wXFcB+pjj7FB7/us9+GcgfW4ziHtYMc1J0PLczb55mg==} engines: {node: '>=8.10.0'} @@ -4422,6 +3829,9 @@ packages: text-decoder@1.1.0: resolution: {integrity: sha512-TmLJNj6UgX8xcUZo4UDStGQtDiTzF7BzWlzn9g7UWrjkpHr5uJTK1ld16wZ3LXb2vb6jH8qU89dW5whuMdXYdw==} + text-hex@1.0.0: + resolution: {integrity: sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==} + through@2.3.8: resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==} @@ -4509,10 +3919,6 @@ packages: resolution: {integrity: sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==} engines: {node: '>=4'} - type-fest@0.20.2: - resolution: {integrity: sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==} - engines: {node: '>=10'} - type-fest@0.21.3: resolution: {integrity: sha512-t0rzBq87m3fVcduHDUFhKmyyX+9eo6WQjZvf51Ea/M0Q7+T374Jp1aUiyUl0GKxp8M/OETVHSDvmkyPgvX+X2w==} engines: {node: '>=10'} @@ -4615,9 +4021,6 @@ packages: walker@1.0.8: resolution: {integrity: sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==} - wcwidth@1.0.1: - resolution: {integrity: sha512-XHPEwS0q6TaxcvG85+8EYkbiCux2XtWG2mkc47Ng2A77BQu9+DqIOJldST4HgPkuea7dvKSj5VgX3P1d4rW8Tg==} - web-streams-polyfill@3.3.3: resolution: {integrity: sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==} engines: {node: '>= 8'} @@ -4652,8 +4055,12 @@ packages: engines: {node: '>= 8'} hasBin: true - winston-transport@4.7.0: - resolution: {integrity: sha512-ajBj65K5I7denzer2IYW6+2bNIVqLGDHqDw3Ow8Ohh+vdW+rv4MZ6eiDvHoKhfJFZ2auyN8byXieDDJ96ViONg==} + winston-transport@4.8.0: + resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==} + engines: {node: '>= 12.0.0'} + + winston@3.14.2: + resolution: {integrity: sha512-CO8cdpBB2yqzEf8v895L+GNKYJiEq8eKlHU38af3snQBQ+sdAIUepjMSguOIJC7ICbzm0ZI+Af2If4vIJrtmOg==} engines: {node: '>= 12.0.0'} wordnet-db@3.1.14: @@ -5038,9 +4445,15 @@ snapshots: dependencies: '@jridgewell/trace-mapping': 0.3.9 + '@dabh/diagnostics@2.0.3': + dependencies: + colorspace: 1.1.4 + enabled: 2.0.0 + kuler: 2.0.0 + '@devil7softwares/pos@1.0.2': {} - '@dqbd/tiktoken@1.0.15': {} + '@dqbd/tiktoken@1.0.16': {} '@flydotio/dockerfile@0.4.11': dependencies: @@ -5050,83 +4463,6 @@ snapshots: shell-quote: 1.8.1 yargs: 17.7.2 - '@grpc/grpc-js@1.10.10': - dependencies: - '@grpc/proto-loader': 0.7.13 - '@js-sdsl/ordered-map': 4.4.2 - - '@grpc/proto-loader@0.7.13': - dependencies: - lodash.camelcase: 4.3.0 - long: 5.2.3 - protobufjs: 7.3.2 - yargs: 17.7.2 - - '@hyperdx/instrumentation-exception@0.1.0(@opentelemetry/api@1.9.0)': - dependencies: - '@hyperdx/instrumentation-sentry-node': 0.1.0(@opentelemetry/api@1.9.0) - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@sentry/core': 8.13.0 - '@sentry/types': 8.13.0 - '@sentry/utils': 8.13.0 - json-stringify-safe: 5.0.1 - shimmer: 1.2.1 - tslib: 2.6.3 - transitivePeerDependencies: - - supports-color - - '@hyperdx/instrumentation-sentry-node@0.1.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 
0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - json-stringify-safe: 5.0.1 - shimmer: 1.2.1 - tslib: 2.6.3 - transitivePeerDependencies: - - supports-color - - '@hyperdx/node-opentelemetry@0.8.1': - dependencies: - '@hyperdx/instrumentation-exception': 0.1.0(@opentelemetry/api@1.9.0) - '@hyperdx/instrumentation-sentry-node': 0.1.0(@opentelemetry/api@1.9.0) - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@opentelemetry/auto-instrumentations-node': 0.46.1(@opentelemetry/api@1.9.0) - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-logs-otlp-http': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-metrics-otlp-proto': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-trace-otlp-proto': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-http': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-runtime-node': 0.4.0(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-logs': 0.51.1(@opentelemetry/api-logs@0.51.1)(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-metrics': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-node': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - cli-spinners: 2.9.2 - json-stringify-safe: 5.0.1 - lodash.isobject: 3.0.2 - lodash.isplainobject: 4.0.6 - lodash.isstring: 4.0.1 - node-fetch: 2.7.0 - open: 8.4.2 - ora: 5.4.1 - pino-abstract-transport: 1.2.0 - semver: 7.6.2 - shimmer: 1.2.1 - tslib: 2.6.3 - winston-transport: 4.7.0 - transitivePeerDependencies: - - encoding - - supports-color - '@ioredis/commands@1.2.0': {} '@isaacs/cliui@8.0.2': @@ -5336,8 +4672,6 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.4.15 - '@js-sdsl/ordered-map@4.4.2': {} - '@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8))': dependencies: ansi-styles: 5.2.0 @@ -5375,39 +4709,12 @@ snapshots: - langchain - openai - '@logtail/core@0.4.21': - dependencies: - '@logtail/tools': 0.4.21 - '@logtail/types': 0.4.20 - serialize-error: 8.1.0 - - '@logtail/node@0.4.21': - dependencies: - '@logtail/core': 0.4.21 - '@logtail/types': 0.4.20 - '@msgpack/msgpack': 2.8.0 - '@types/stack-trace': 0.0.29 - cross-fetch: 3.1.8 - minimatch: 3.1.2 - serialize-error: 8.1.0 - stack-trace: 0.0.10 - transitivePeerDependencies: - - encoding - - '@logtail/tools@0.4.21': - dependencies: - '@logtail/types': 0.4.20 - - '@logtail/types@0.4.20': {} - '@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': dependencies: sparse-bitfield: 3.0.3 - '@msgpack/msgpack@2.8.0': {} - '@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.3': optional: true @@ -5434,210 +4741,21 @@ snapshots: '@one-ini/wasm@0.1.1': {} - '@opentelemetry/api-logs@0.51.1': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 '@opentelemetry/api@1.9.0': {} - '@opentelemetry/auto-instrumentations-node@0.46.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - 
'@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-amqplib': 0.37.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-aws-lambda': 0.41.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-aws-sdk': 0.41.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-bunyan': 0.38.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-cassandra-driver': 0.38.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-connect': 0.36.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-cucumber': 0.6.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-dataloader': 0.9.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-dns': 0.36.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-express': 0.39.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-fastify': 0.36.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-fs': 0.12.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-generic-pool': 0.36.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-graphql': 0.40.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-grpc': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-hapi': 0.38.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-http': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-ioredis': 0.40.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-knex': 0.36.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-koa': 0.40.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-lru-memoizer': 0.37.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-memcached': 0.36.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-mongodb': 0.43.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-mongoose': 0.38.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-mysql': 0.38.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-mysql2': 0.38.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-nestjs-core': 0.37.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-net': 0.36.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-pg': 0.41.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-pino': 0.39.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-redis': 0.39.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-redis-4': 0.39.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-restify': 0.38.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-router': 0.37.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-socket.io': 0.39.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-tedious': 0.10.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-undici': 0.2.0(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation-winston': 0.37.0(@opentelemetry/api@1.9.0) - '@opentelemetry/resource-detector-alibaba-cloud': 0.28.10(@opentelemetry/api@1.9.0) - '@opentelemetry/resource-detector-aws': 1.5.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resource-detector-azure': 0.2.9(@opentelemetry/api@1.9.0) - '@opentelemetry/resource-detector-container': 0.3.11(@opentelemetry/api@1.9.0) - '@opentelemetry/resource-detector-gcp': 0.29.10(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-node': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - encoding - - 
supports-color - - '@opentelemetry/context-async-hooks@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/context-async-hooks@1.25.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 - '@opentelemetry/core@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/semantic-conventions': 1.24.1 - '@opentelemetry/core@1.25.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 '@opentelemetry/semantic-conventions': 1.25.1 - '@opentelemetry/exporter-logs-otlp-http@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-transformer': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-logs': 0.51.1(@opentelemetry/api-logs@0.51.1)(@opentelemetry/api@1.9.0) - - '@opentelemetry/exporter-metrics-otlp-http@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-transformer': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-metrics': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/exporter-metrics-otlp-proto@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-metrics-otlp-http': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-proto-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-transformer': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-metrics': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/exporter-trace-otlp-grpc@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@grpc/grpc-js': 1.10.10 - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-grpc-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-transformer': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/exporter-trace-otlp-http@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-transformer': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/exporter-trace-otlp-proto@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-proto-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-transformer': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0) 
- - '@opentelemetry/exporter-zipkin@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.24.1 - - '@opentelemetry/instrumentation-amqplib@0.37.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-aws-lambda@0.41.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/propagator-aws-xray': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@types/aws-lambda': 8.10.122 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-aws-sdk@0.41.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/propagation-utils': 0.30.10(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-bunyan@0.38.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@types/bunyan': 1.8.9 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-cassandra-driver@0.38.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-connect@0.36.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@types/connect': 3.4.36 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-connect@0.38.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5648,39 +4766,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-cucumber@0.6.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-dataloader@0.9.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-dns@0.36.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - semver: 7.6.2 - 
transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-express@0.39.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-express@0.41.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5690,15 +4775,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-fastify@0.36.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-fastify@0.38.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5708,14 +4784,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-fs@0.12.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-fs@0.14.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5724,21 +4792,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-generic-pool@0.36.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-graphql@0.40.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-graphql@0.42.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5746,23 +4799,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-grpc@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.24.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-hapi@0.38.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-hapi@0.40.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5772,16 +4808,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-http@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.24.1 - semver: 7.6.2 - transitivePeerDependencies: - - supports-color - 
'@opentelemetry/instrumentation-http@0.52.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5792,15 +4818,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-ioredis@0.40.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/redis-common': 0.36.2 - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-ioredis@0.42.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5810,25 +4827,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-knex@0.36.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-koa@0.40.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@types/koa': 2.14.0 - '@types/koa__router': 12.0.3 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-koa@0.42.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5838,31 +4836,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-lru-memoizer@0.37.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-memcached@0.36.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@types/memcached': 2.2.10 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-mongodb@0.43.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-metrics': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-mongodb@0.46.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5872,15 +4845,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-mongoose@0.38.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-mongoose@0.40.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5890,15 +4854,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-mysql2@0.38.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@opentelemetry/sql-common': 
0.40.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-mysql2@0.40.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5908,15 +4863,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-mysql@0.38.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@types/mysql': 2.15.22 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-mysql@0.40.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5926,14 +4872,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-nestjs-core@0.37.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-nestjs-core@0.39.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5942,25 +4880,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-net@0.36.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-pg@0.41.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@opentelemetry/sql-common': 0.40.1(@opentelemetry/api@1.9.0) - '@types/pg': 8.6.1 - '@types/pg-pool': 2.0.4 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-pg@0.43.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5972,22 +4891,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-pino@0.39.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-redis-4@0.39.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/redis-common': 0.36.2 - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation-redis-4@0.41.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -5997,72 +4900,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/instrumentation-redis@0.39.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/redis-common': 0.36.2 - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-restify@0.38.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - 
transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-router@0.37.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-runtime-node@0.4.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-socket.io@0.39.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-tedious@0.10.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - '@types/tedious': 4.0.14 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-undici@0.2.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - - '@opentelemetry/instrumentation-winston@0.37.0(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation@0.46.0(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -6075,18 +4912,6 @@ snapshots: - supports-color optional: true - '@opentelemetry/instrumentation@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@types/shimmer': 1.0.5 - import-in-the-middle: 1.7.4 - require-in-the-middle: 7.3.0 - semver: 7.6.2 - shimmer: 1.2.1 - transitivePeerDependencies: - - supports-color - '@opentelemetry/instrumentation@0.52.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -6099,119 +4924,14 @@ snapshots: transitivePeerDependencies: - supports-color - '@opentelemetry/otlp-exporter-base@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/otlp-grpc-exporter-base@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@grpc/grpc-js': 1.10.10 - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - protobufjs: 7.3.2 - - '@opentelemetry/otlp-proto-exporter-base@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/otlp-exporter-base': 0.51.1(@opentelemetry/api@1.9.0) - protobufjs: 7.3.2 - - '@opentelemetry/otlp-transformer@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-logs': 
0.51.1(@opentelemetry/api-logs@0.51.1)(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-metrics': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/propagation-utils@0.30.10(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - - '@opentelemetry/propagator-aws-xray@1.25.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/propagator-b3@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/propagator-jaeger@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/redis-common@0.36.2': {} - '@opentelemetry/resource-detector-alibaba-cloud@0.28.10(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - - '@opentelemetry/resource-detector-aws@1.5.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - - '@opentelemetry/resource-detector-azure@0.2.9(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - - '@opentelemetry/resource-detector-container@0.3.11(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - - '@opentelemetry/resource-detector-gcp@0.29.10(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.25.1 - gcp-metadata: 6.1.0 - transitivePeerDependencies: - - encoding - - supports-color - - '@opentelemetry/resources@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.24.1 - '@opentelemetry/resources@1.25.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) '@opentelemetry/semantic-conventions': 1.25.1 - '@opentelemetry/sdk-logs@0.51.1(@opentelemetry/api-logs@0.51.1)(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - - '@opentelemetry/sdk-metrics@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - lodash.merge: 4.6.2 - '@opentelemetry/sdk-metrics@1.25.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -6219,32 +4939,6 @@ snapshots: '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) lodash.merge: 4.6.2 - '@opentelemetry/sdk-node@0.51.1(@opentelemetry/api@1.9.0)': - dependencies: - 
'@opentelemetry/api': 1.9.0 - '@opentelemetry/api-logs': 0.51.1 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-trace-otlp-grpc': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-trace-otlp-http': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-trace-otlp-proto': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/exporter-zipkin': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/instrumentation': 0.51.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-logs': 0.51.1(@opentelemetry/api-logs@0.51.1)(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-metrics': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-node': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.24.1 - transitivePeerDependencies: - - supports-color - - '@opentelemetry/sdk-trace-base@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/resources': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/semantic-conventions': 1.24.1 - '@opentelemetry/sdk-trace-base@1.25.1(@opentelemetry/api@1.9.0)': dependencies: '@opentelemetry/api': 1.9.0 @@ -6252,18 +4946,6 @@ snapshots: '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) '@opentelemetry/semantic-conventions': 1.25.1 - '@opentelemetry/sdk-trace-node@1.24.1(@opentelemetry/api@1.9.0)': - dependencies: - '@opentelemetry/api': 1.9.0 - '@opentelemetry/context-async-hooks': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/core': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/propagator-b3': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/propagator-jaeger': 1.24.1(@opentelemetry/api@1.9.0) - '@opentelemetry/sdk-trace-base': 1.24.1(@opentelemetry/api@1.9.0) - semver: 7.6.2 - - '@opentelemetry/semantic-conventions@1.24.1': {} - '@opentelemetry/semantic-conventions@1.25.1': {} '@opentelemetry/sql-common@0.40.1(@opentelemetry/api@1.9.0)': @@ -6290,29 +4972,6 @@ snapshots: transitivePeerDependencies: - supports-color - '@protobufjs/aspromise@1.1.2': {} - - '@protobufjs/base64@1.1.2': {} - - '@protobufjs/codegen@2.0.4': {} - - '@protobufjs/eventemitter@1.1.0': {} - - '@protobufjs/fetch@1.1.0': - dependencies: - '@protobufjs/aspromise': 1.1.2 - '@protobufjs/inquire': 1.1.0 - - '@protobufjs/float@1.0.2': {} - - '@protobufjs/inquire@1.1.0': {} - - '@protobufjs/path@1.1.2': {} - - '@protobufjs/pool@1.1.0': {} - - '@protobufjs/utf8@1.1.0': {} - '@puppeteer/browsers@2.2.3': dependencies: debug: 4.3.4 @@ -6405,11 +5064,6 @@ snapshots: - encoding - supports-color - '@sentry/core@8.13.0': - dependencies: - '@sentry/types': 8.13.0 - '@sentry/utils': 8.13.0 - '@sentry/core@8.26.0': dependencies: '@sentry/types': 8.26.0 @@ -6473,14 +5127,8 @@ snapshots: transitivePeerDependencies: - supports-color - '@sentry/types@8.13.0': {} - '@sentry/types@8.26.0': {} - '@sentry/utils@8.13.0': - dependencies: - '@sentry/types': 8.13.0 - '@sentry/utils@8.26.0': dependencies: '@sentry/types': 8.26.0 @@ -6631,12 +5279,6 @@ snapshots: '@tsconfig/recommended@1.0.6': {} - '@types/accepts@1.3.7': - dependencies: - '@types/node': 20.14.1 - - '@types/aws-lambda@8.10.122': {} - '@types/babel__core@7.20.5': dependencies: '@babel/parser': 7.24.6 @@ -6663,10 +5305,6 @@ snapshots: '@types/connect': 3.4.38 '@types/node': 20.14.1 - '@types/bunyan@1.8.9': - dependencies: - '@types/node': 
20.14.1 - '@types/connect@3.4.36': dependencies: '@types/node': 20.14.1 @@ -6675,19 +5313,12 @@ snapshots: dependencies: '@types/node': 20.14.1 - '@types/content-disposition@0.5.8': {} - - '@types/cookies@0.9.0': - dependencies: - '@types/connect': 3.4.38 - '@types/express': 4.17.21 - '@types/keygrip': 1.0.6 - '@types/node': 20.14.1 - '@types/cors@2.8.17': dependencies: '@types/node': 20.14.1 + '@types/escape-html@1.0.4': {} + '@types/express-serve-static-core@4.19.3': dependencies: '@types/node': 20.14.1 @@ -6712,8 +5343,6 @@ snapshots: dependencies: '@types/node': 20.14.1 - '@types/http-assert@1.5.5': {} - '@types/http-errors@2.0.4': {} '@types/istanbul-lib-coverage@2.0.6': {} @@ -6731,31 +5360,6 @@ snapshots: expect: 29.7.0 pretty-format: 29.7.0 - '@types/keygrip@1.0.6': {} - - '@types/koa-compose@3.2.8': - dependencies: - '@types/koa': 2.14.0 - - '@types/koa@2.14.0': - dependencies: - '@types/accepts': 1.3.7 - '@types/content-disposition': 0.5.8 - '@types/cookies': 0.9.0 - '@types/http-assert': 1.5.5 - '@types/http-errors': 2.0.4 - '@types/keygrip': 1.0.6 - '@types/koa-compose': 3.2.8 - '@types/node': 20.14.1 - - '@types/koa__router@12.0.3': - dependencies: - '@types/koa': 2.14.0 - - '@types/memcached@2.2.10': - dependencies: - '@types/node': 20.14.1 - '@types/mime@1.3.5': {} '@types/mysql@2.15.22': @@ -6775,6 +5379,8 @@ snapshots: dependencies: undici-types: 5.26.5 + '@types/pdf-parse@1.1.4': {} + '@types/pg-pool@2.0.4': dependencies: '@types/pg': 8.6.1 @@ -6806,14 +5412,8 @@ snapshots: '@types/shimmer@1.0.5': {} - '@types/stack-trace@0.0.29': {} - '@types/stack-utils@2.0.3': {} - '@types/tedious@4.0.14': - dependencies: - '@types/node': 20.14.1 - '@types/triple-beam@1.3.5': {} '@types/uuid@9.0.8': {} @@ -7064,8 +5664,6 @@ snapshots: basic-ftp@5.0.5: {} - bignumber.js@9.1.2: {} - bin-links@4.0.4: dependencies: cmd-shim: 6.0.3 @@ -7077,12 +5675,6 @@ snapshots: binary-search@1.3.6: {} - bl@4.1.0: - dependencies: - buffer: 5.7.1 - inherits: 2.0.4 - readable-stream: 3.6.2 - bluebird@3.4.7: {} body-parser@1.20.2: @@ -7243,20 +5835,12 @@ snapshots: cjs-module-lexer@1.3.1: {} - cli-cursor@3.1.0: - dependencies: - restore-cursor: 3.1.0 - - cli-spinners@2.9.2: {} - cliui@8.0.1: dependencies: string-width: 4.2.3 strip-ansi: 6.0.1 wrap-ansi: 7.0.0 - clone@1.0.4: {} - cluster-key-slot@1.1.2: {} cmd-shim@6.0.3: {} @@ -7279,6 +5863,21 @@ snapshots: color-name@1.1.4: {} + color-string@1.9.1: + dependencies: + color-name: 1.1.4 + simple-swizzle: 0.2.2 + + color@3.2.1: + dependencies: + color-convert: 1.9.3 + color-string: 1.9.1 + + colorspace@1.1.4: + dependencies: + color: 3.2.1 + text-hex: 1.0.0 + combined-stream@1.0.8: dependencies: delayed-stream: 1.0.0 @@ -7407,18 +6006,12 @@ snapshots: deepmerge@4.3.1: {} - defaults@1.0.4: - dependencies: - clone: 1.0.4 - define-data-property@1.1.4: dependencies: es-define-property: 1.0.0 es-errors: 1.3.0 gopd: 1.0.1 - define-lazy-prop@2.0.0: {} - degenerator@5.0.1: dependencies: ast-types: 0.13.4 @@ -7508,6 +6101,8 @@ snapshots: emoji-regex@9.2.2: {} + enabled@2.0.0: {} + encodeurl@1.0.2: {} end-of-stream@1.4.4: @@ -7628,8 +6223,6 @@ snapshots: transitivePeerDependencies: - supports-color - extend@3.0.2: {} - extract-zip@2.0.1: dependencies: debug: 4.3.4 @@ -7692,6 +6285,8 @@ snapshots: flat@5.0.2: {} + fn.name@1.1.0: {} + follow-redirects@1.15.6: {} foreground-child@3.2.1: @@ -7740,25 +6335,6 @@ snapshots: function-bind@1.1.2: {} - gaxios@6.7.0: - dependencies: - extend: 3.0.2 - https-proxy-agent: 7.0.5 - is-stream: 2.0.1 - node-fetch: 2.7.0 - uuid: 10.0.0 - 
transitivePeerDependencies: - - encoding - - supports-color - - gcp-metadata@6.1.0: - dependencies: - gaxios: 6.7.0 - json-bigint: 1.0.0 - transitivePeerDependencies: - - encoding - - supports-color - generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7978,13 +6554,6 @@ snapshots: module-details-from-path: 1.0.3 optional: true - import-in-the-middle@1.7.4: - dependencies: - acorn: 8.12.0 - acorn-import-attributes: 1.9.5(acorn@8.12.0) - cjs-module-lexer: 1.3.1 - module-details-from-path: 1.0.3 - import-local@3.1.0: dependencies: pkg-dir: 4.2.0 @@ -8026,6 +6595,8 @@ snapshots: is-arrayish@0.2.1: {} + is-arrayish@0.3.2: {} + is-binary-path@2.1.0: dependencies: binary-extensions: 2.3.0 @@ -8036,8 +6607,6 @@ snapshots: dependencies: hasown: 2.0.2 - is-docker@2.2.1: {} - is-extglob@2.1.1: {} is-fullwidth-code-point@3.0.0: {} @@ -8048,8 +6617,6 @@ snapshots: dependencies: is-extglob: 2.1.1 - is-interactive@1.0.0: {} - is-number@7.0.0: {} is-plain-obj@2.1.0: {} @@ -8058,12 +6625,6 @@ snapshots: is-stream@2.0.1: {} - is-unicode-supported@0.1.0: {} - - is-wsl@2.2.0: - dependencies: - is-docker: 2.2.1 - isarray@1.0.0: {} isexe@2.0.0: {} @@ -8476,18 +7037,12 @@ snapshots: jsesc@2.5.2: {} - json-bigint@1.0.0: - dependencies: - bignumber.js: 9.1.2 - json-parse-even-better-errors@2.3.1: {} json-schema-to-zod@2.3.0: {} json-schema-traverse@1.0.0: {} - json-stringify-safe@5.0.1: {} - json5@2.2.3: {} jsonfile@6.1.0: @@ -8513,6 +7068,8 @@ snapshots: koffi@2.9.0: {} + kuler@2.0.0: {} + langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0): dependencies: '@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.57.0(zod@3.23.8))(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.57.0(zod@3.23.8)) @@ -8579,18 +7136,10 @@ snapshots: dependencies: p-locate: 4.1.0 - lodash.camelcase@4.3.0: {} - lodash.defaults@4.2.0: {} lodash.isarguments@3.1.0: {} - lodash.isobject@3.0.2: {} - - lodash.isplainobject@4.0.6: {} - - lodash.isstring@4.0.1: {} - lodash.memoize@4.1.2: {} lodash.merge@4.6.2: {} @@ -8599,11 +7148,6 @@ snapshots: lodash@4.17.21: {} - log-symbols@4.1.0: - dependencies: - chalk: 4.1.2 - is-unicode-supported: 0.1.0 - logform@2.6.0: dependencies: '@colors/colors': 1.6.0 @@ -8613,6 +7157,15 @@ snapshots: safe-stable-stringify: 2.4.3 triple-beam: 1.4.1 + logform@2.6.1: + dependencies: + '@colors/colors': 1.6.0 + '@types/triple-beam': 1.3.5 + fecha: 4.2.3 + ms: 2.1.3 + safe-stable-stringify: 2.4.3 + triple-beam: 1.4.1 + loglevel@1.9.1: {} logsnag@1.0.0: @@ -8621,8 +7174,6 @@ snapshots: transitivePeerDependencies: - encoding - long@5.2.3: {} - loose-envify@1.4.0: dependencies: js-tokens: 4.0.0 @@ -8670,6 +7221,8 @@ snapshots: underscore: 1.13.6 xmlbuilder: 10.1.1 + marked@14.1.2: {} + md5@2.3.0: dependencies: charenc: 0.0.2 @@ -8937,16 +7490,14 @@ snapshots: dependencies: wrappy: 1.0.2 + one-time@1.0.0: + dependencies: + fn.name: 1.1.0 + onetime@5.1.2: dependencies: mimic-fn: 2.1.0 - open@8.4.2: - dependencies: - define-lazy-prop: 2.0.0 - is-docker: 2.2.1 - is-wsl: 2.2.0 - openai@3.3.0: dependencies: axios: 0.26.1 @@ -8985,18 +7536,6 @@ snapshots: option@0.2.4: {} - ora@5.4.1: - dependencies: - bl: 
4.1.0 - chalk: 4.1.2 - cli-cursor: 3.1.0 - cli-spinners: 2.9.2 - is-interactive: 1.0.0 - is-unicode-supported: 0.1.0 - log-symbols: 4.1.0 - strip-ansi: 6.0.1 - wcwidth: 1.0.1 - p-finally@1.0.0: {} p-limit@2.3.0: @@ -9148,11 +7687,6 @@ snapshots: picomatch@2.3.1: {} - pino-abstract-transport@1.2.0: - dependencies: - readable-stream: 4.5.2 - split2: 4.2.0 - pirates@4.0.6: {} pkg-dir@4.2.0: @@ -9221,21 +7755,6 @@ snapshots: proto-list@1.2.4: {} - protobufjs@7.3.2: - dependencies: - '@protobufjs/aspromise': 1.1.2 - '@protobufjs/base64': 1.1.2 - '@protobufjs/codegen': 2.0.4 - '@protobufjs/eventemitter': 1.1.0 - '@protobufjs/fetch': 1.1.0 - '@protobufjs/float': 1.0.2 - '@protobufjs/inquire': 1.1.0 - '@protobufjs/path': 1.1.2 - '@protobufjs/pool': 1.1.0 - '@protobufjs/utf8': 1.1.0 - '@types/node': 20.14.1 - long: 5.2.3 - proxy-addr@2.0.7: dependencies: forwarded: 0.2.0 @@ -9417,11 +7936,6 @@ snapshots: path-parse: 1.0.7 supports-preserve-symlinks-flag: 1.0.0 - restore-cursor@3.1.0: - dependencies: - onetime: 5.1.2 - signal-exit: 3.0.7 - retry@0.13.1: {} rimraf@5.0.7: @@ -9490,10 +8004,6 @@ snapshots: transitivePeerDependencies: - supports-color - serialize-error@8.1.0: - dependencies: - type-fest: 0.20.2 - serve-static@1.15.0: dependencies: encodeurl: 1.0.2 @@ -9539,6 +8049,10 @@ snapshots: signal-exit@4.1.0: {} + simple-swizzle@0.2.2: + dependencies: + is-arrayish: 0.3.2 + simple-update-notifier@1.1.0: dependencies: semver: 7.0.0 @@ -9725,6 +8239,8 @@ snapshots: dependencies: b4a: 1.6.6 + text-hex@1.0.0: {} + through@2.3.8: {} tmpl@1.0.5: {} @@ -9795,8 +8311,6 @@ snapshots: type-detect@4.0.8: {} - type-fest@0.20.2: {} - type-fest@0.21.3: {} type-is@1.6.18: @@ -9880,10 +8394,6 @@ snapshots: dependencies: makeerror: 1.0.12 - wcwidth@1.0.1: - dependencies: - defaults: 1.0.4 - web-streams-polyfill@3.3.3: {} web-streams-polyfill@4.0.0-beta.3: {} @@ -9912,12 +8422,26 @@ snapshots: dependencies: isexe: 2.0.0 - winston-transport@4.7.0: + winston-transport@4.8.0: dependencies: - logform: 2.6.0 - readable-stream: 3.6.2 + logform: 2.6.1 + readable-stream: 4.5.2 triple-beam: 1.4.1 + winston@3.14.2: + dependencies: + '@colors/colors': 1.6.0 + '@dabh/diagnostics': 2.0.3 + async: 3.2.5 + is-stream: 2.0.1 + logform: 2.6.0 + one-time: 1.0.0 + readable-stream: 3.6.2 + safe-stable-stringify: 2.4.3 + stack-trace: 0.0.10 + triple-beam: 1.4.1 + winston-transport: 4.8.0 + wordnet-db@3.1.14: {} wordpos@2.1.0: diff --git a/apps/api/requests.http b/apps/api/requests.http index 3e7bd2b7..809bae7b 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,15 +1,15 @@ ### Crawl Website POST http://localhost:3002/v0/scrape HTTP/1.1 -Authorization: Bearer fc- +Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { - "url":"corterix.com" + "url":"firecrawl.dev" } ### Check Job Status GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 -Authorization: Bearer fc- +Authorization: Bearer {{$dotenv TEST_API_KEY}} ### Check Job Status @@ -18,7 +18,7 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1 ### Scrape Website POST http://localhost:3002/v0/crawl HTTP/1.1 -Authorization: Bearer fc- +Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { @@ -45,7 +45,7 @@ content-type: application/json ### Scrape Website POST http://localhost:3002/v0/scrape HTTP/1.1 -Authorization: Bearer +Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { @@ -56,12 +56,12 @@ content-type: application/json ### Check Job Status GET 
http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 -Authorization: Bearer +Authorization: Bearer {{$dotenv TEST_API_KEY}} ### Get Job Result POST https://api.firecrawl.dev/v0/crawl HTTP/1.1 -Authorization: Bearer +Authorization: Bearer {{$dotenv TEST_API_KEY}} content-type: application/json { @@ -70,7 +70,7 @@ content-type: application/json ### Check Job Status GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 -Authorization: Bearer +Authorization: Bearer {{$dotenv TEST_API_KEY}} ### Get Active Jobs Count GET http://localhost:3002/serverHealthCheck diff --git a/apps/api/sharedLibs/go-html-to-md/.gitignore b/apps/api/sharedLibs/go-html-to-md/.gitignore new file mode 100644 index 00000000..bdab47c6 --- /dev/null +++ b/apps/api/sharedLibs/go-html-to-md/.gitignore @@ -0,0 +1,2 @@ +html-to-markdown.so +html-to-markdown.h \ No newline at end of file diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index b1708abc..dec77131 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -844,7 +844,7 @@ describe("E2E Tests for API Routes", () => { expect(crawlInitResponse.statusCode).toBe(200); expect(crawlInitResponse.body).toHaveProperty("jobId"); - let crawlStatus: string; + let crawlStatus: string = "scraping"; let crawlData = []; while (crawlStatus !== "completed") { const statusResponse = await request(TEST_URL) diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index acb22780..83f676b8 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -20,7 +20,6 @@ describe("E2E Tests for API Routes with No Authentication", () => { process.env.SCRAPING_BEE_API_KEY = ""; process.env.OPENAI_API_KEY = ""; process.env.BULL_AUTH_KEY = ""; - process.env.LOGTAIL_KEY = ""; process.env.PLAYWRIGHT_MICROSERVICE_URL = ""; process.env.LLAMAPARSE_API_KEY = ""; process.env.TEST_API_KEY = ""; diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index a4163472..8c3d1731 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -1,7 +1,7 @@ import request from "supertest"; import { configDotenv } from "dotenv"; import { - ScrapeRequest, + ScrapeRequestInput, ScrapeResponseRequestTest, } from "../../controllers/v1/types"; @@ -44,7 +44,7 @@ describe("E2E Tests for v1 API Routes", () => { }); it.concurrent("should throw error for blocklisted URL", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://facebook.com/fake-test", }; @@ -73,7 +73,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent( "should return a successful response with a valid API key", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", }; @@ -125,7 +125,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent( "should return a successful response with a valid API key", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://arxiv.org/abs/2410.04840", }; @@ -167,7 +167,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent( "should return a successful 
response with a valid API key and includeHtml set to true", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", formats: ["markdown", "html"], }; @@ -194,7 +194,7 @@ describe("E2E Tests for v1 API Routes", () => { 30000 ); it.concurrent('should return a successful response for a valid scrape with PDF file', async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://arxiv.org/pdf/astro-ph/9301001.pdf" // formats: ["markdown", "html"], }; @@ -217,7 +217,7 @@ describe("E2E Tests for v1 API Routes", () => { }, 60000); it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://arxiv.org/pdf/astro-ph/9301001" }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -240,7 +240,7 @@ describe("E2E Tests for v1 API Routes", () => { }, 60000); it.concurrent("should return a successful response with a valid API key with removeTags option", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://www.scrapethissite.com/", onlyMainContent: false // default is true }; @@ -261,7 +261,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer - const scrapeRequestWithRemoveTags: ScrapeRequest = { + const scrapeRequestWithRemoveTags: ScrapeRequestInput = { url: "https://www.scrapethissite.com/", excludeTags: ['.nav', '#footer', 'strong'], onlyMainContent: false // default is true @@ -407,7 +407,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent( "should return a successful response with a valid API key and includeHtml set to true", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", formats: ["html","rawHtml"], }; @@ -438,7 +438,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent( "should return a successful response with waitFor", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://ycombinator.com/companies", formats: ["markdown"], waitFor: 8000 @@ -471,7 +471,7 @@ describe("E2E Tests for v1 API Routes", () => { it.concurrent( "should return a successful response with a valid links on page", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://roastmywebsite.ai", formats: ["links"], }; @@ -672,7 +672,7 @@ describe("POST /v1/crawl", () => { }); it.concurrent("should throw error for blocklisted URL", async () => { - const scrapeRequest: ScrapeRequest = { + const scrapeRequest: ScrapeRequestInput = { url: "https://facebook.com/fake-test", }; diff --git a/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts new file mode 100644 index 00000000..5c7feb1f --- /dev/null +++ b/apps/api/src/__tests__/e2e_v1_withAuth_all_params/index.test.ts @@ -0,0 +1,603 @@ +import request from "supertest"; +import { configDotenv } from "dotenv"; +import { + ScrapeRequest, + ScrapeResponseRequestTest, +} from "../../controllers/v1/types"; + +configDotenv(); +const 
FIRECRAWL_API_URL = "http://127.0.0.1:3002"; +const E2E_TEST_SERVER_URL = "http://firecrawl-e2e-test.vercel.app"; // @rafaelsideguide/firecrawl-e2e-test + +describe("E2E Tests for v1 API Routes", () => { + + it.concurrent('should return a successful response for a scrape with 403 page', async () => { + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post('/v1/scrape') + .set('Authorization', `Bearer ${process.env.TEST_API_KEY}`) + .set('Content-Type', 'application/json') + .send({ url: 'https://httpstat.us/403' }); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).toHaveProperty('markdown'); + expect(response.body.data).toHaveProperty('metadata'); + expect(response.body.data.metadata.statusCode).toBe(403); + }, 30000); + + it.concurrent("should handle 'formats:markdown (default)' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data).toHaveProperty("markdown"); + + expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl."); + expect(response.body.data.markdown).toContain("Content with id #content-1"); + // expect(response.body.data.markdown).toContain("Loading..."); + expect(response.body.data.markdown).toContain("Click me!"); + expect(response.body.data.markdown).toContain("Power your AI apps with clean data crawled from any website. It's also open-source."); // firecrawl.dev inside an iframe + expect(response.body.data.markdown).toContain("This content loads only when you see it. Don't blink! 👼"); // the browser always scroll to the bottom + expect(response.body.data.markdown).not.toContain("Header"); // Only main content is returned by default + expect(response.body.data.markdown).not.toContain("footer"); // Only main content is returned by default + expect(response.body.data.markdown).not.toContain("This content is only visible on mobile"); + }, + 30000); + + it.concurrent("should handle 'formats:html' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + formats: ["html"] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("html"); + + expect(response.body.data.html).not.toContain("
Header"); + expect(response.body.data.html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.
"); + }, + 30000); + + it.concurrent("should handle 'rawHtml' in 'formats' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + formats: ["rawHtml"] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data).not.toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("rawHtml"); + + expect(response.body.data.rawHtml).toContain(">This page is used for end-to-end (e2e) testing with Firecrawl.
"); + expect(response.body.data.rawHtml).toContain(">Header"); + }, + 30000); + + // - TODO: tests for links + // - TODO: tests for screenshot + // - TODO: tests for screenshot@fullPage + + it.concurrent("should handle 'headers' parameter correctly", async () => { + // @ts-ignore + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + headers: { "e2e-header-test": "firecrawl" } + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data.markdown).toContain("e2e-header-test: firecrawl"); + }, 30000); + + it.concurrent("should handle 'includeTags' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + includeTags: ['#content-1'] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data.markdown).not.toContain("
This page is used for end-to-end (e2e) testing with Firecrawl.
"); + expect(response.body.data.markdown).toContain("Content with id #content-1"); + }, + 30000); + + it.concurrent("should handle 'excludeTags' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + excludeTags: ['#content-1'] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl."); + expect(response.body.data.markdown).not.toContain("Content with id #content-1"); + }, + 30000); + + it.concurrent("should handle 'onlyMainContent' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + formats: ["html", "markdown"], + onlyMainContent: false + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("data"); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data.markdown).toContain("This page is used for end-to-end (e2e) testing with Firecrawl."); + expect(response.body.data.html).toContain("
Header
"); + }, + 30000); + + it.concurrent("should handle 'timeout' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + timeout: 500 + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(408); + + if (!("error" in response.body)) { + throw new Error("Expected response body to have 'error' property"); + } + expect(response.body.error).toBe("Request timed out"); + expect(response.body.success).toBe(false); + }, 30000); + + + it.concurrent("should handle 'mobile' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + mobile: true + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data.markdown).toContain("This content is only visible on mobile"); + }, + 30000); + + it.concurrent("should handle 'parsePDF' parameter correctly", + async () => { + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf'}); + await new Promise((r) => setTimeout(r, 6000)); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty('data'); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + expect(response.body.data.markdown).toContain('arXiv:astro-ph/9301001v1 7 Jan 1993'); + expect(response.body.data.markdown).not.toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm'); + + const responseNoParsePDF: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: 'https://arxiv.org/pdf/astro-ph/9301001.pdf', parsePDF: false }); + await new Promise((r) => setTimeout(r, 6000)); + + expect(responseNoParsePDF.statusCode).toBe(200); + expect(responseNoParsePDF.body).toHaveProperty('data'); + if (!("data" in responseNoParsePDF.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(responseNoParsePDF.body.data.markdown).toContain('h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm'); + }, + 30000); + + // it.concurrent("should handle 'location' parameter correctly", + // async () => { + // const scrapeRequest: ScrapeRequest = { + // url: "https://roastmywebsite.ai", + // location: { + // country: "US", + // languages: ["en"] + // } + // }; + + // const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + // .post("/v1/scrape") + // .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + // .set("Content-Type", "application/json") + // .send(scrapeRequest); + + // expect(response.statusCode).toBe(200); + // // Add assertions to verify location is handled correctly + // }, + // 30000); + + it.concurrent("should handle 'skipTlsVerification' parameter correctly", + async () => { 
+ const scrapeRequest = { + url: "https://expired.badssl.com/", + timeout: 120000 + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + console.log("Error1a") + // console.log(response.body) + expect(response.statusCode).toBe(200); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data.metadata.pageStatusCode).toBe(500); + console.log("Error?") + + const scrapeRequestWithSkipTlsVerification = { + url: "https://expired.badssl.com/", + skipTlsVerification: true, + timeout: 120000 + + } as ScrapeRequest; + + const responseWithSkipTlsVerification: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequestWithSkipTlsVerification); + + console.log("Error1b") + // console.log(responseWithSkipTlsVerification.body) + expect(responseWithSkipTlsVerification.statusCode).toBe(200); + if (!("data" in responseWithSkipTlsVerification.body)) { + throw new Error("Expected response body to have 'data' property"); + } + // console.log(responseWithSkipTlsVerification.body.data) + expect(responseWithSkipTlsVerification.body.data.markdown).toContain("badssl.com"); + }, + 60000); + + it.concurrent("should handle 'removeBase64Images' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + removeBase64Images: true + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + // console.log(response.body.data.markdown) + // - TODO: not working for every image + // expect(response.body.data.markdown).toContain("Image-Removed"); + }, + 30000); + + it.concurrent("should handle 'action wait' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + actions: [{ + type: "wait", + milliseconds: 10000 + }] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data.markdown).not.toContain("Loading..."); + expect(response.body.data.markdown).toContain("Content loaded after 5 seconds!"); + }, + 30000); + + // screenshot + it.concurrent("should handle 'action screenshot' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + actions: [{ + type: "screenshot" + }] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + if (!("data" in 
response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + if (!response.body.data.actions?.screenshots) { + throw new Error("Expected response body to have screenshots array"); + } + expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0); + expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"); + + // TODO compare screenshot with expected screenshot + }, + 30000); + + it.concurrent("should handle 'action screenshot@fullPage' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + actions: [{ + type: "screenshot", + fullPage: true + }, + { + type:"scrape" + }] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + // console.log(response.body.data.actions?.screenshots[0]) + if (!response.body.data.actions?.screenshots) { + throw new Error("Expected response body to have screenshots array"); + } + expect(response.body.data.actions.screenshots[0].length).toBeGreaterThan(0); + expect(response.body.data.actions.screenshots[0]).toContain("https://service.firecrawl.dev/storage/v1/object/public/media/screenshot-"); + + if (!response.body.data.actions?.scrapes) { + throw new Error("Expected response body to have scrapes array"); + } + expect(response.body.data.actions.scrapes[0].url).toBe("https://firecrawl-e2e-test.vercel.app/"); + expect(response.body.data.actions.scrapes[0].html).toContain("This page is used for end-to-end (e2e) testing with Firecrawl.
"); + // TODO compare screenshot with expected full page screenshot + }, + 30000); + + it.concurrent("should handle 'action click' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + actions: [{ + type: "click", + selector: "#click-me" + }] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data.markdown).not.toContain("Click me!"); + expect(response.body.data.markdown).toContain("Text changed after click!"); + }, + 30000); + + it.concurrent("should handle 'action write' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + formats: ["html"], + actions: [{ + type: "click", + selector: "#input-1" + }, + { + type: "write", + text: "Hello, world!" + } + ]} as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + expect(response.statusCode).toBe(200); + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + + // TODO: fix this test (need to fix fire-engine first) + // uncomment the following line: + // expect(response.body.data.html).toContain(""); + }, + 30000); + + // TODO: fix this test (need to fix fire-engine first) + it.concurrent("should handle 'action pressKey' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + formats: ["markdown"], + actions: [ + { + type: "press", + key: "ArrowDown" + } + ] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + // // TODO: fix this test (need to fix fire-engine first) + // // right now response.body is: { success: false, error: '(Internal server error) - null' } + // expect(response.statusCode).toBe(200); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // expect(response.body.data.markdown).toContain("Last Key Clicked: ArrowDown") + }, + 30000); + + // TODO: fix this test (need to fix fire-engine first) + it.concurrent("should handle 'action scroll' parameter correctly", + async () => { + const scrapeRequest = { + url: E2E_TEST_SERVER_URL, + formats: ["markdown"], + actions: [ + { + type: "click", + selector: "#scroll-bottom-loader" + }, + { + type: "scroll", + direction: "down", + amount: 2000 + } + ] + } as ScrapeRequest; + + const response: ScrapeResponseRequestTest = await request(FIRECRAWL_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + // TODO: uncomment this tests + // expect(response.statusCode).toBe(200); + // if (!("data" in response.body)) { + // throw new Error("Expected response body to have 'data' property"); + // } + // + // expect(response.body.data.markdown).toContain("You have reached the bottom!") + }, + 
30000); + + // TODO: test scrape action + +}); \ No newline at end of file diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 26caf63e..1646843f 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -776,7 +776,8 @@ describe("E2E Tests for v0 API Routes", () => { await new Promise((r) => setTimeout(r, 10000)); const completedResponse = await request(TEST_URL) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`) - .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .maxResponseSize(4000000000); expect(completedResponse.statusCode).toBe(200); expect(completedResponse.body).toHaveProperty("status"); diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index c9f693c5..de74fed0 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -9,9 +9,8 @@ import { import { supabase_service } from "../services/supabase"; import { withAuth } from "../lib/withAuth"; import { RateLimiterRedis } from "rate-limiter-flexible"; -import { setTraceAttributes } from "@hyperdx/node-opentelemetry"; import { sendNotification } from "../services/notification/email_notification"; -import { Logger } from "../lib/logger"; +import { logger } from "../lib/logger"; import { redlock } from "../services/redlock"; import { deleteKey, getValue } from "../services/redis"; import { setValue } from "../services/redis"; @@ -40,8 +39,8 @@ function normalizedApiIsUuid(potentialUuid: string): boolean { export async function setCachedACUC( api_key: string, acuc: - | AuthCreditUsageChunk - | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk) + | AuthCreditUsageChunk | null + | ((acuc: AuthCreditUsageChunk) => AuthCreditUsageChunk | null) ) { const cacheKeyACUC = `acuc_${api_key}`; const redLockKey = `lock_${cacheKeyACUC}`; @@ -49,7 +48,7 @@ export async function setCachedACUC( try { await redlock.using([redLockKey], 10000, {}, async (signal) => { if (typeof acuc === "function") { - acuc = acuc(JSON.parse(await getValue(cacheKeyACUC))); + acuc = acuc(JSON.parse(await getValue(cacheKeyACUC) ?? 
"null")); if (acuc === null) { if (signal.aborted) { @@ -69,7 +68,7 @@ export async function setCachedACUC( await setValue(cacheKeyACUC, JSON.stringify(acuc), 600, true); }); } catch (error) { - Logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`); + logger.error(`Error updating cached ACUC ${cacheKeyACUC}: ${error}`); } } @@ -103,7 +102,7 @@ export async function getACUC( break; } - Logger.warn( + logger.warn( `Failed to retrieve authentication and credit usage data after ${retries}, trying again...` ); retries++; @@ -146,33 +145,14 @@ export async function authenticateUser( res, mode?: RateLimiterMode ): Promise { - return withAuth(supaAuthenticateUser)(req, res, mode); -} - -function setTrace(team_id: string, api_key: string) { - try { - setTraceAttributes({ - team_id, - api_key, - }); - } catch (error) { - Sentry.captureException(error); - Logger.error(`Error setting trace attributes: ${error.message}`); - } + return withAuth(supaAuthenticateUser, { success: true, chunk: null, team_id: "bypass" })(req, res, mode); } export async function supaAuthenticateUser( req, res, mode?: RateLimiterMode -): Promise<{ - success: boolean; - team_id?: string; - error?: string; - status?: number; - plan?: PlanType; - chunk?: AuthCreditUsageChunk; -}> { +): Promise { const authHeader = req.headers.authorization ?? (req.headers["sec-websocket-protocol"] @@ -200,7 +180,7 @@ export async function supaAuthenticateUser( let teamId: string | null = null; let priceId: string | null = null; - let chunk: AuthCreditUsageChunk; + let chunk: AuthCreditUsageChunk | null = null; if (token == "this_is_just_a_preview_token") { if (mode == RateLimiterMode.CrawlStatus) { @@ -233,8 +213,6 @@ export async function supaAuthenticateUser( priceId = chunk.price_id; const plan = getPlanByPriceId(priceId); - // HyperDX Logging - setTrace(teamId, normalizedApi); subscriptionData = { team_id: teamId, plan, @@ -291,7 +269,7 @@ export async function supaAuthenticateUser( try { await rateLimiter.consume(team_endpoint_token); } catch (rateLimiterRes) { - Logger.error(`Rate limit exceeded: ${rateLimiterRes}`); + logger.error(`Rate limit exceeded: ${rateLimiterRes}`); const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); @@ -318,7 +296,7 @@ export async function supaAuthenticateUser( mode === RateLimiterMode.CrawlStatus || mode === RateLimiterMode.Search) ) { - return { success: true, team_id: "preview" }; + return { success: true, team_id: "preview", chunk: null }; // check the origin of the request and make sure its from firecrawl.dev // const origin = req.headers.origin; // if (origin && origin.includes("firecrawl.dev")){ @@ -333,12 +311,12 @@ export async function supaAuthenticateUser( return { success: true, - team_id: subscriptionData.team_id, - plan: (subscriptionData.plan ?? "") as PlanType, + team_id: teamId ?? undefined, + plan: (subscriptionData?.plan ?? 
"") as PlanType, chunk, }; } -function getPlanByPriceId(price_id: string): PlanType { +function getPlanByPriceId(price_id: string | null): PlanType { switch (price_id) { case process.env.STRIPE_PRICE_ID_STARTER: return "starter"; diff --git a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts index 876ca98a..75acd60a 100644 --- a/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts +++ b/apps/api/src/controllers/v0/admin/acuc-cache-clear.ts @@ -1,7 +1,7 @@ import { Request, Response } from "express"; import { supabase_service } from "../../../services/supabase"; import { clearACUC } from "../../auth"; -import { Logger } from "../../../lib/logger"; +import { logger } from "../../../lib/logger"; export async function acucCacheClearController(req: Request, res: Response) { try { @@ -12,11 +12,11 @@ export async function acucCacheClearController(req: Request, res: Response) { .select("*") .eq("team_id", team_id); - await Promise.all(keys.data.map((x) => clearACUC(x.key))); + await Promise.all((keys.data ?? []).map((x) => clearACUC(x.key))); res.json({ ok: true }); } catch (error) { - Logger.error(`Error clearing ACUC cache via API route: ${error}`); + logger.error(`Error clearing ACUC cache via API route: ${error}`); res.status(500).json({ error: "Internal server error" }); } } diff --git a/apps/api/src/controllers/v0/admin/queue.ts b/apps/api/src/controllers/v0/admin/queue.ts index 71748002..6ef8a992 100644 --- a/apps/api/src/controllers/v0/admin/queue.ts +++ b/apps/api/src/controllers/v0/admin/queue.ts @@ -1,7 +1,7 @@ import { Request, Response } from "express"; import { Job } from "bullmq"; -import { Logger } from "../../../lib/logger"; +import { logger } from "../../../lib/logger"; import { getScrapeQueue } from "../../../services/queue-service"; import { checkAlerts } from "../../../services/alerts"; import { sendSlackWebhook } from "../../../services/alerts/slack"; @@ -10,7 +10,7 @@ export async function cleanBefore24hCompleteJobsController( req: Request, res: Response ) { - Logger.info("🐂 Cleaning jobs older than 24h"); + logger.info("🐂 Cleaning jobs older than 24h"); try { const scrapeQueue = getScrapeQueue(); const batchSize = 10; @@ -31,7 +31,7 @@ export async function cleanBefore24hCompleteJobsController( ).flat(); const before24hJobs = completedJobs.filter( - (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + (job) => job.finishedOn !== undefined && job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 ) || []; let count = 0; @@ -45,12 +45,12 @@ export async function cleanBefore24hCompleteJobsController( await job.remove(); count++; } catch (jobError) { - Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`); + logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`); } } return res.status(200).send(`Removed ${count} completed jobs.`); } catch (error) { - Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`); + logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`); return res.status(500).send("Failed to clean jobs"); } } @@ -60,7 +60,7 @@ export async function checkQueuesController(req: Request, res: Response) { await checkAlerts(); return res.status(200).send("Alerts initialized"); } catch (error) { - Logger.debug(`Failed to initialize alerts: ${error}`); + logger.debug(`Failed to initialize alerts: ${error}`); return res.status(500).send("Failed to initialize alerts"); } } @@ -81,7 +81,7 @@ export async function queuesController(req: Request, res: Response) { 
noActiveJobs, }); } catch (error) { - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -165,7 +165,7 @@ export async function autoscalerController(req: Request, res: Response) { } if (targetMachineCount !== activeMachines) { - Logger.info( + logger.info( `🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting` ); @@ -193,7 +193,7 @@ export async function autoscalerController(req: Request, res: Response) { count: activeMachines, }); } catch (error) { - Logger.error(error); + logger.error(error); return res.status(500).send("Failed to initialize autoscaler"); } } diff --git a/apps/api/src/controllers/v0/admin/redis-health.ts b/apps/api/src/controllers/v0/admin/redis-health.ts index dc58d745..dc587606 100644 --- a/apps/api/src/controllers/v0/admin/redis-health.ts +++ b/apps/api/src/controllers/v0/admin/redis-health.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import Redis from "ioredis"; -import { Logger } from "../../../lib/logger"; +import { logger } from "../../../lib/logger"; import { redisRateLimitClient } from "../../../services/rate-limiter"; export async function redisHealthController(req: Request, res: Response) { @@ -10,14 +10,14 @@ export async function redisHealthController(req: Request, res: Response) { return await operation(); } catch (error) { if (attempt === retries) throw error; - Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`); + logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`); await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying } } }; try { - const queueRedis = new Redis(process.env.REDIS_URL); + const queueRedis = new Redis(process.env.REDIS_URL!); const testKey = "test"; const testValue = "test"; @@ -29,7 +29,7 @@ export async function redisHealthController(req: Request, res: Response) { queueRedisHealth = await retryOperation(() => queueRedis.get(testKey)); await retryOperation(() => queueRedis.del(testKey)); } catch (error) { - Logger.error(`queueRedis health check failed: ${error}`); + logger.error(`queueRedis health check failed: ${error}`); queueRedisHealth = null; } @@ -42,7 +42,7 @@ export async function redisHealthController(req: Request, res: Response) { ); await retryOperation(() => redisRateLimitClient.del(testKey)); } catch (error) { - Logger.error(`redisRateLimitClient health check failed: ${error}`); + logger.error(`redisRateLimitClient health check failed: ${error}`); redisRateLimitHealth = null; } @@ -56,10 +56,10 @@ export async function redisHealthController(req: Request, res: Response) { healthStatus.queueRedis === "healthy" && healthStatus.redisRateLimitClient === "healthy" ) { - Logger.info("Both Redis instances are healthy"); + logger.info("Both Redis instances are healthy"); return res.status(200).json({ status: "healthy", details: healthStatus }); } else { - Logger.info( + logger.info( `Redis instances health check: ${JSON.stringify(healthStatus)}` ); // await sendSlackWebhook( @@ -73,7 +73,7 @@ export async function redisHealthController(req: Request, res: Response) { .json({ status: "unhealthy", details: healthStatus }); } } catch (error) { - Logger.error(`Redis health check failed: ${error}`); + logger.error(`Redis health check failed: ${error}`); // await sendSlackWebhook( // `[REDIS DOWN] Redis instances health check: ${error.message}`, // true diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts 
b/apps/api/src/controllers/v0/crawl-cancel.ts index efcd454a..e81064f2 100644 --- a/apps/api/src/controllers/v0/crawl-cancel.ts +++ b/apps/api/src/controllers/v0/crawl-cancel.ts @@ -2,7 +2,7 @@ import { Request, Response } from "express"; import { authenticateUser } from "../auth"; import { RateLimiterMode } from "../../../src/types"; import { supabase_service } from "../../../src/services/supabase"; -import { Logger } from "../../../src/lib/logger"; +import { logger } from "../../../src/lib/logger"; import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis"; import * as Sentry from "@sentry/node"; import { configDotenv } from "dotenv"; @@ -12,15 +12,17 @@ export async function crawlCancelController(req: Request, res: Response) { try { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; - const { success, team_id, error, status } = await authenticateUser( + const auth = await authenticateUser( req, res, RateLimiterMode.CrawlStatus ); - if (!success) { - return res.status(status).json({ error }); + if (!auth.success) { + return res.status(auth.status).json({ error: auth.error }); } + const { team_id } = auth; + const sc = await getCrawl(req.params.jobId); if (!sc) { return res.status(404).json({ error: "Job not found" }); @@ -46,7 +48,7 @@ export async function crawlCancelController(req: Request, res: Response) { sc.cancelled = true; await saveCrawl(req.params.jobId, sc); } catch (error) { - Logger.error(error); + logger.error(error); } res.json({ @@ -54,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) { }); } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index 66522bcf..7b6e610a 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -2,15 +2,17 @@ import { Request, Response } from "express"; import { authenticateUser } from "../auth"; import { RateLimiterMode } from "../../../src/types"; import { getScrapeQueue } from "../../../src/services/queue-service"; -import { Logger } from "../../../src/lib/logger"; +import { logger } from "../../../src/lib/logger"; import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs"; import * as Sentry from "@sentry/node"; import { configDotenv } from "dotenv"; +import { Job } from "bullmq"; +import { toLegacyDocument } from "../v1/types"; configDotenv(); export async function getJobs(crawlId: string, ids: string[]) { - const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); + const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as Job[]; if (process.env.USE_DB_AUTHENTICATION === "true") { const supabaseData = await supabaseGetJobsByCrawlId(crawlId); @@ -32,15 +34,17 @@ export async function getJobs(crawlId: string, ids: string[]) { export async function crawlStatusController(req: Request, res: Response) { try { - const { success, team_id, error, status } = await authenticateUser( + const auth = await authenticateUser( req, res, RateLimiterMode.CrawlStatus ); - if (!success) { - return res.status(status).json({ error }); + if (!auth.success) { + return res.status(auth.status).json({ error: auth.error }); } + const { team_id } = auth; + const sc = await getCrawl(req.params.jobId); 
if (!sc) { return res.status(404).json({ error: "Job not found" }); @@ -90,12 +94,12 @@ export async function crawlStatusController(req: Request, res: Response) { status: jobStatus, current: jobStatuses.filter(x => x === "completed" || x === "failed").length, total: jobs.length, - data: jobStatus === "completed" ? data : null, - partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null), + data: jobStatus === "completed" ? data.map(x => toLegacyDocument(x, sc.internalOptions)) : null, + partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null).map(x => toLegacyDocument(x, sc.internalOptions)), }); } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts index fb412147..d502d142 100644 --- a/apps/api/src/controllers/v0/crawl.ts +++ b/apps/api/src/controllers/v0/crawl.ts @@ -9,24 +9,28 @@ import { validateIdempotencyKey } from "../../../src/services/idempotency/valida import { createIdempotencyKey } from "../../../src/services/idempotency/create"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../../src/lib/logger"; +import { logger } from "../../../src/lib/logger"; import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; import { getScrapeQueue } from "../../../src/services/queue-service"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; +import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types"; +import { ZodError } from "zod"; export async function crawlController(req: Request, res: Response) { try { - const { success, team_id, error, status, plan, chunk } = await authenticateUser( + const auth = await authenticateUser( req, res, RateLimiterMode.Crawl ); - if (!success) { - return res.status(status).json({ error }); + if (!auth.success) { + return res.status(auth.status).json({ error: auth.error }); } + const { team_id, plan, chunk } = auth; + if (req.headers["x-idempotency-key"]) { const isIdempotencyValid = await validateIdempotencyKey(req); if (!isIdempotencyValid) { @@ -35,7 +39,7 @@ export async function crawlController(req: Request, res: Response) { try { createIdempotencyKey(req); } catch (error) { - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -77,7 +81,7 @@ export async function crawlController(req: Request, res: Response) { // TODO: need to do this to v1 crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); - let url = req.body.url; + let url = urlSchema.parse(req.body.url); if (!url) { return res.status(400).json({ error: "Url is required" }); } @@ -123,7 +127,7 @@ export async function crawlController(req: Request, res: Response) { // documents: docs, // }); // } catch (error) { - // Logger.error(error); + // logger.error(error); // return res.status(500).json({ error: error.message }); // } // } @@ -132,10 +136,13 @@ export async function crawlController(req: Request, res: Response) { await logCrawl(id, team_id); + const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined); + const sc: StoredCrawl = { 
originUrl: url, crawlerOptions, - pageOptions, + scrapeOptions, + internalOptions, team_id, plan, createdAt: Date.now(), @@ -170,10 +177,11 @@ export async function crawlController(req: Request, res: Response) { data: { url, mode: "single_urls", - crawlerOptions: crawlerOptions, + crawlerOptions, + scrapeOptions, + internalOptions, team_id, plan, - pageOptions: pageOptions, origin: req.body.origin ?? defaultOrigin, crawl_id: id, sitemapped: true, @@ -208,10 +216,11 @@ export async function crawlController(req: Request, res: Response) { { url, mode: "single_urls", - crawlerOptions: crawlerOptions, + crawlerOptions, + scrapeOptions, + internalOptions, team_id, - plan, - pageOptions: pageOptions, + plan: plan!, origin: req.body.origin ?? defaultOrigin, crawl_id: id, }, @@ -226,7 +235,9 @@ export async function crawlController(req: Request, res: Response) { res.json({ jobId: id }); } catch (error) { Sentry.captureException(error); - Logger.error(error); - return res.status(500).json({ error: error.message }); + logger.error(error); + return res.status(500).json({ error: error instanceof ZodError + ? "Invalid URL" + : error.message }); } } diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts index e9f6e806..8b82bef8 100644 --- a/apps/api/src/controllers/v0/crawlPreview.ts +++ b/apps/api/src/controllers/v0/crawlPreview.ts @@ -3,15 +3,16 @@ import { authenticateUser } from "../auth"; import { RateLimiterMode } from "../../../src/types"; import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../../src/lib/logger"; +import { logger } from "../../../src/lib/logger"; import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis"; import { addScrapeJob } from "../../../src/services/queue-jobs"; import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import * as Sentry from "@sentry/node"; +import { fromLegacyScrapeOptions } from "../v1/types"; export async function crawlPreviewController(req: Request, res: Response) { try { - const { success, error, status, team_id:a, plan } = await authenticateUser( + const auth = await authenticateUser( req, res, RateLimiterMode.Preview @@ -19,10 +20,12 @@ export async function crawlPreviewController(req: Request, res: Response) { const team_id = "preview"; - if (!success) { - return res.status(status).json({ error }); + if (!auth.success) { + return res.status(auth.status).json({ error: auth.error }); } + const { plan } = auth; + let url = req.body.url; if (!url) { return res.status(400).json({ error: "Url is required" }); @@ -71,7 +74,7 @@ export async function crawlPreviewController(req: Request, res: Response) { // documents: docs, // }); // } catch (error) { - // Logger.error(error); + // logger.error(error); // return res.status(500).json({ error: error.message }); // } // } @@ -84,10 +87,13 @@ export async function crawlPreviewController(req: Request, res: Response) { robots = await this.getRobotsTxt(); } catch (_) {} + const { scrapeOptions, internalOptions } = fromLegacyScrapeOptions(pageOptions, undefined, undefined); + const sc: StoredCrawl = { originUrl: url, crawlerOptions, - pageOptions, + scrapeOptions, + internalOptions, team_id, plan, robots, @@ -107,10 +113,11 @@ export async function crawlPreviewController(req: Request, res: Response) { await addScrapeJob({ url, mode: "single_urls", - crawlerOptions: crawlerOptions, team_id, - plan, - pageOptions: 
pageOptions, + plan: plan!, + crawlerOptions, + scrapeOptions, + internalOptions, origin: "website-preview", crawl_id: id, sitemapped: true, @@ -123,10 +130,11 @@ export async function crawlPreviewController(req: Request, res: Response) { await addScrapeJob({ url, mode: "single_urls", - crawlerOptions: crawlerOptions, team_id, - plan, - pageOptions: pageOptions, + plan: plan!, + crawlerOptions, + scrapeOptions, + internalOptions, origin: "website-preview", crawl_id: id, }, {}, jobId); @@ -136,7 +144,7 @@ export async function crawlPreviewController(req: Request, res: Response) { res.json({ jobId: id }); } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/v0/keyAuth.ts b/apps/api/src/controllers/v0/keyAuth.ts index b70d672a..63915302 100644 --- a/apps/api/src/controllers/v0/keyAuth.ts +++ b/apps/api/src/controllers/v0/keyAuth.ts @@ -8,13 +8,14 @@ import { authenticateUser } from "../auth"; export const keyAuthController = async (req: Request, res: Response) => { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status } = await authenticateUser( + const auth = await authenticateUser( req, res ); - if (!success) { - return res.status(status).json({ error }); + if (!auth.success) { + return res.status(auth.status).json({ error: auth.error }); } + // if success, return success: true return res.status(200).json({ success: true }); } catch (error) { diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 5e6b7c6f..05fb3c41 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -7,7 +7,7 @@ import { import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; -import { Document } from "../../lib/entities"; +import { Document, fromLegacyCombo, toLegacyDocument, url as urlSchema } from "../v1/types"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; import { @@ -19,9 +19,11 @@ import { import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { getScrapeQueue } from "../../services/queue-service"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../lib/logger"; +import { logger } from "../../lib/logger"; import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; +import { fromLegacyScrapeOptions } from "../v1/types"; +import { ZodError } from "zod"; export async function scrapeHelper( jobId: string, @@ -35,10 +37,10 @@ export async function scrapeHelper( ): Promise<{ success: boolean; error?: string; - data?: Document; + data?: Document | { url: string }; returnCode: number; }> { - const url = req.body.url; + const url = urlSchema.parse(req.body.url); if (typeof url !== "string") { return { success: false, error: "Url is required", returnCode: 400 }; } @@ -54,15 +56,16 @@ export async function scrapeHelper( const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 }); + const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, extractorOptions, timeout, crawlerOptions); + await addScrapeJob( { url, mode: "single_urls", - crawlerOptions, team_id, - pageOptions, - plan, - extractorOptions, + scrapeOptions, + 
internalOptions, + plan: plan!, origin: req.body.origin ?? defaultOrigin, is_scrape: true, }, @@ -81,7 +84,7 @@ export async function scrapeHelper( }, async (span) => { try { - doc = (await waitForJob(jobId, timeout))[0]; + doc = (await waitForJob(jobId, timeout)); } catch (e) { if (e instanceof Error && e.message.startsWith("Job wait")) { span.setAttribute("timedOut", true); @@ -149,7 +152,7 @@ export async function scrapeHelper( return { success: true, - data: doc, + data: toLegacyDocument(doc, internalOptions), returnCode: 200, }; } @@ -158,15 +161,17 @@ export async function scrapeController(req: Request, res: Response) { try { let earlyReturn = false; // make sure to authenticate user first, Bearer - const { success, team_id, error, status, plan, chunk } = await authenticateUser( + const auth = await authenticateUser( req, res, RateLimiterMode.Scrape ); - if (!success) { - return res.status(status).json({ error }); + if (!auth.success) { + return res.status(auth.status).json({ error: auth.error }); } + const { team_id, plan, chunk } = auth; + const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; const extractorOptions = { @@ -200,7 +205,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing" }); } } catch (error) { - Logger.error(error); + logger.error(error); earlyReturn = true; return res.status(500).json({ error: @@ -224,8 +229,8 @@ export async function scrapeController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = - result.data && result.data.markdown - ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") + result.data && (result.data as Document).markdown + ? 
numTokensFromString((result.data as Document).markdown!, "gpt-3.5-turbo") : 0; if (result.success) { @@ -246,7 +251,7 @@ export async function scrapeController(req: Request, res: Response) { if (creditsToBeBilled > 0) { // billing for doc done on queue end, bill only for llm extraction billTeam(team_id, chunk?.sub_id, creditsToBeBilled).catch(error => { - Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); + logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); // Optionally, you could notify an admin or add to a retry queue here }); } @@ -254,17 +259,19 @@ export async function scrapeController(req: Request, res: Response) { let doc = result.data; if (!pageOptions || !pageOptions.includeRawHtml) { - if (doc && doc.rawHtml) { - delete doc.rawHtml; + if (doc && (doc as Document).rawHtml) { + delete (doc as Document).rawHtml; } } if(pageOptions && pageOptions.includeExtract) { - if(!pageOptions.includeMarkdown && doc && doc.markdown) { - delete doc.markdown; + if(!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { + delete (doc as Document).markdown; } } + const { scrapeOptions } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout); + logJob({ job_id: jobId, success: result.success, @@ -276,21 +283,22 @@ export async function scrapeController(req: Request, res: Response) { mode: "scrape", url: req.body.url, crawlerOptions: crawlerOptions, - pageOptions: pageOptions, + scrapeOptions, origin: origin, - extractor_options: extractorOptions, num_tokens: numTokens, }); return res.status(result.returnCode).json(result); } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: - typeof error === "string" - ? error - : error?.message ?? "Internal Server Error", + error instanceof ZodError + ? "Invalid URL" + : typeof error === "string" + ? error + : error?.message ?? 
"Internal Server Error", }); } } diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 67cff8eb..e0102406 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -1,5 +1,4 @@ import { Request, Response } from "express"; -import { WebScraperDataProvider } from "../../scraper/WebScraper"; import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; @@ -8,21 +7,23 @@ import { PageOptions, SearchOptions } from "../../lib/entities"; import { search } from "../../search"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../lib/logger"; +import { logger } from "../../lib/logger"; import { getScrapeQueue } from "../../services/queue-service"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; +import { Job } from "bullmq"; +import { Document, fromLegacyCombo, fromLegacyScrapeOptions, toLegacyDocument } from "../v1/types"; export async function searchHelper( jobId: string, req: Request, team_id: string, - subscription_id: string, + subscription_id: string | null | undefined, crawlerOptions: any, pageOptions: PageOptions, searchOptions: SearchOptions, - plan: PlanType + plan: PlanType | undefined ): Promise<{ success: boolean; error?: string; @@ -35,8 +36,8 @@ export async function searchHelper( return { success: false, error: "Query is required", returnCode: 400 }; } - const tbs = searchOptions.tbs ?? null; - const filter = searchOptions.filter ?? null; + const tbs = searchOptions.tbs ?? undefined; + const filter = searchOptions.filter ?? undefined; let num_results = Math.min(searchOptions.limit ?? 
7, 10); if (team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") { @@ -57,11 +58,12 @@ export async function searchHelper( }); let justSearch = pageOptions.fetchPageContent === false; - + + const { scrapeOptions, internalOptions } = fromLegacyCombo(pageOptions, undefined, 60000, crawlerOptions); if (justSearch) { billTeam(team_id, subscription_id, res.length).catch(error => { - Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`); + logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`); // Optionally, you could notify an admin or add to a retry queue here }); return { success: true, data: res, returnCode: 200 }; @@ -88,9 +90,9 @@ export async function searchHelper( data: { url, mode: "single_urls", - crawlerOptions: crawlerOptions, team_id: team_id, - pageOptions: pageOptions, + scrapeOptions, + internalOptions, }, opts: { jobId: uuid, @@ -104,7 +106,7 @@ export async function searchHelper( await addScrapeJob(job.data as any, {}, job.opts.jobId, job.opts.priority) } - const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => x[0]); + const docs = (await Promise.all(jobDatas.map(x => waitForJob(x.opts.jobId, 60000)))).map(x => toLegacyDocument(x, internalOptions)); if (docs.length === 0) { return { success: true, error: "No search results found", returnCode: 200 }; @@ -115,7 +117,7 @@ export async function searchHelper( // make sure doc.content is not empty const filteredDocs = docs.filter( - (doc: { content?: string }) => doc && doc.content && doc.content.trim().length > 0 + (doc: any) => doc && doc.content && doc.content.trim().length > 0 ); if (filteredDocs.length === 0) { @@ -132,14 +134,15 @@ export async function searchHelper( export async function searchController(req: Request, res: Response) { try { // make sure to authenticate user first, Bearer - const { success, team_id, error, status, plan, chunk } = await authenticateUser( + const auth = await authenticateUser( req, res, RateLimiterMode.Search ); - if (!success) { - return res.status(status).json({ error }); + if (!auth.success) { + return res.status(auth.status).json({ error: auth.error }); } + const { team_id, plan, chunk } = auth; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { includeHtml: req.body.pageOptions?.includeHtml ?? 
false, @@ -162,7 +165,7 @@ export async function searchController(req: Request, res: Response) { } } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: "Internal server error" }); } const startTime = new Date().getTime(); @@ -189,7 +192,6 @@ export async function searchController(req: Request, res: Response) { mode: "search", url: req.body.query, crawlerOptions: crawlerOptions, - pageOptions: pageOptions, origin: origin, }); return res.status(result.returnCode).json(result); @@ -199,7 +201,7 @@ export async function searchController(req: Request, res: Response) { } Sentry.captureException(error); - Logger.error(error); + logger.error("Unhandled error occurred in search", { error }); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/v0/status.ts b/apps/api/src/controllers/v0/status.ts index bf8d2834..c5eafc2d 100644 --- a/apps/api/src/controllers/v0/status.ts +++ b/apps/api/src/controllers/v0/status.ts @@ -1,5 +1,5 @@ import { Request, Response } from "express"; -import { Logger } from "../../../src/lib/logger"; +import { logger } from "../../../src/lib/logger"; import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { getJobs } from "./crawl-status"; import * as Sentry from "@sentry/node"; @@ -37,7 +37,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons }); } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 286163df..9c6a288c 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -4,8 +4,6 @@ import { BatchScrapeRequest, batchScrapeRequestSchema, CrawlResponse, - legacyExtractorOptions, - legacyScrapeOptions, RequestWithAuth, } from "./types"; import { @@ -29,19 +27,16 @@ export async function batchScrapeController( await logCrawl(id, req.auth.team_id); - let { remainingCredits } = req.account; + let { remainingCredits } = req.account!; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; if(!useDbAuthentication){ remainingCredits = Infinity; } - const pageOptions = legacyScrapeOptions(req.body); - const extractorOptions = req.body.extract ? 
legacyExtractorOptions(req.body.extract) : undefined; - - const sc: StoredCrawl = { crawlerOptions: null, - pageOptions, + scrapeOptions: req.body, + internalOptions: {}, team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, @@ -64,10 +59,9 @@ export async function batchScrapeController( url: x, mode: "single_urls" as const, team_id: req.auth.team_id, - plan: req.auth.plan, + plan: req.auth.plan!, crawlerOptions: null, - pageOptions, - extractorOptions, + scrapeOptions: req.body, origin: "api", crawl_id: id, sitemapped: true, diff --git a/apps/api/src/controllers/v1/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts index f8fba824..958318b5 100644 --- a/apps/api/src/controllers/v1/crawl-cancel.ts +++ b/apps/api/src/controllers/v1/crawl-cancel.ts @@ -1,6 +1,6 @@ import { Response } from "express"; import { supabase_service } from "../../services/supabase"; -import { Logger } from "../../lib/logger"; +import { logger } from "../../lib/logger"; import { getCrawl, saveCrawl } from "../../lib/crawl-redis"; import * as Sentry from "@sentry/node"; import { configDotenv } from "dotenv"; @@ -36,7 +36,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string sc.cancelled = true; await saveCrawl(req.params.jobId, sc); } catch (error) { - Logger.error(error); + logger.error(error); } res.json({ @@ -44,7 +44,7 @@ export async function crawlCancelController(req: RequestWithAuth<{ jobId: string }); } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/v1/crawl-status-ws.ts b/apps/api/src/controllers/v1/crawl-status-ws.ts index 3738e3a2..18222edc 100644 --- a/apps/api/src/controllers/v1/crawl-status-ws.ts +++ b/apps/api/src/controllers/v1/crawl-status-ws.ts @@ -1,14 +1,15 @@ import { authMiddleware } from "../../routes/v1"; import { RateLimiterMode } from "../../types"; import { authenticateUser } from "../auth"; -import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; +import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, RequestWithAuth } from "./types"; import { WebSocket } from "ws"; import { v4 as uuidv4 } from "uuid"; -import { Logger } from "../../lib/logger"; +import { logger } from "../../lib/logger"; import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; import { getJob, getJobs } from "./crawl-status"; import * as Sentry from "@sentry/node"; +import { Job, JobState } from "bullmq"; type ErrorMessage = { type: "error", @@ -56,7 +57,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth { @@ -70,15 +71,14 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth !doneJobIDs.includes(x)); const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)])); - const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]); - - for (const jobID of newlyDoneJobIDs) { - const job = await getJob(jobID); + const newlyDoneJobIDs: string[] = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]); + const newlyDoneJobs: Job[] = (await Promise.all(newlyDoneJobIDs.map(x => getJob(x)))).filter(x => 
x !== undefined) as Job[] + for (const job of newlyDoneJobs) { if (job.returnvalue) { send(ws, { type: "document", - data: legacyDocumentConverter(job.returnvalue), + data: job.returnvalue, }) } else { return close(ws, 3000, { type: "error", error: job.failedReason }); @@ -100,8 +100,8 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth legacyDocumentConverter(x)), + data: data, } }); @@ -139,19 +139,21 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth) { try { - const { success, team_id, error, status, plan } = await authenticateUser( + const auth = await authenticateUser( req, null, RateLimiterMode.CrawlStatus, ); - if (!success) { + if (!auth.success) { return close(ws, 3000, { type: "error", - error, + error: auth.error, }); } + const { team_id, plan } = auth; + req.auth = { team_id, plan }; await crawlStatusWS(ws, req); @@ -170,7 +172,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut } } - Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); + logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose); return close(ws, 1011, { type: "error", error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index a8d78293..f150ddc4 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -1,9 +1,10 @@ import { Response } from "express"; -import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types"; +import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, RequestWithAuth } from "./types"; import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, getThrottledJobs } from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs"; import { configDotenv } from "dotenv"; +import { Job, JobState } from "bullmq"; configDotenv(); export async function getJob(id: string) { @@ -24,7 +25,7 @@ export async function getJob(id: string) { } export async function getJobs(ids: string[]) { - const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); + const jobs: (Job & { id: string })[] = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x) as (Job & {id: string})[]; if (process.env.USE_DB_AUTHENTICATION === "true") { const supabaseData = await supabaseGetJobsById(ids); @@ -63,8 +64,8 @@ export async function crawlStatusController(req: RequestWithAuth 0) { - if (!doneJobs[0].data.pageOptions.includeRawHtml) { + if (!doneJobs[0].data.scrapeOptions.formats.includes("rawHtml")) { for (let ii = 0; ii < doneJobs.length; ii++) { if (data[ii]) { delete data[ii].rawHtml; @@ -142,7 +143,7 @@ export async function crawlStatusController(req: RequestWithAuth legacyDocumentConverter(x)), + data: data, }); } diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 53cc04e8..f9f60e71 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -4,9 +4,8 @@ import { CrawlRequest, crawlRequestSchema, CrawlResponse, - legacyCrawlerOptions, - legacyScrapeOptions, RequestWithAuth, + toLegacyCrawlerOptions, } from "./types"; import { 
addCrawlJob, @@ -20,9 +19,10 @@ import { import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { addScrapeJob } from "../../services/queue-jobs"; -import { Logger } from "../../lib/logger"; +import { logger } from "../../lib/logger"; import { getJobPriority } from "../../lib/job-priority"; import { callWebhook } from "../../services/webhook"; +import { scrapeOptions as scrapeOptionsSchema } from "./types"; export async function crawlController( req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, @@ -34,18 +34,22 @@ export async function crawlController( await logCrawl(id, req.auth.team_id); - let { remainingCredits } = req.account; + let { remainingCredits } = req.account!; const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; if(!useDbAuthentication){ remainingCredits = Infinity; } - const crawlerOptions = legacyCrawlerOptions(req.body); - const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); + const crawlerOptions = { + ...req.body, + url: undefined, + scrapeOptions: undefined, + }; + const scrapeOptions = req.body.scrapeOptions; // TODO: @rafa, is this right? copied from v0 - if (Array.isArray(crawlerOptions.includes)) { - for (const x of crawlerOptions.includes) { + if (Array.isArray(crawlerOptions.includePaths)) { + for (const x of crawlerOptions.includePaths) { try { new RegExp(x); } catch (e) { @@ -54,8 +58,8 @@ export async function crawlController( } } - if (Array.isArray(crawlerOptions.excludes)) { - for (const x of crawlerOptions.excludes) { + if (Array.isArray(crawlerOptions.excludePaths)) { + for (const x of crawlerOptions.excludePaths) { try { new RegExp(x); } catch (e) { @@ -68,8 +72,9 @@ export async function crawlController( const sc: StoredCrawl = { originUrl: req.body.url, - crawlerOptions, - pageOptions, + crawlerOptions: toLegacyCrawlerOptions(crawlerOptions), + scrapeOptions, + internalOptions: {}, team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, @@ -78,9 +83,9 @@ export async function crawlController( const crawler = crawlToCrawler(id, sc); try { - sc.robots = await crawler.getRobotsTxt(pageOptions.skipTlsVerification); + sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification); } catch (e) { - Logger.debug( + logger.debug( `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( e )}` @@ -112,7 +117,7 @@ export async function crawlController( team_id: req.auth.team_id, plan: req.auth.plan, crawlerOptions, - pageOptions, + scrapeOptions, origin: "api", crawl_id: id, sitemapped: true, @@ -142,10 +147,10 @@ export async function crawlController( { url: req.body.url, mode: "single_urls", - crawlerOptions: crawlerOptions, team_id: req.auth.team_id, - plan: req.auth.plan, - pageOptions: pageOptions, + crawlerOptions, + scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions), + plan: req.auth.plan!, origin: "api", crawl_id: id, webhook: req.body.webhook, diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 5ed3dd51..45856543 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -1,9 +1,9 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; import { - legacyCrawlerOptions, mapRequestSchema, RequestWithAuth, + scrapeOptions, } from "./types"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { MapResponse, MapRequest } from "./types"; @@ -18,11 +18,11 @@ import { fireEngineMap } from 
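
The v1 crawl controller above keeps the v0 behaviour of rejecting invalid regexes in `includePaths`/`excludePaths` before any jobs are queued. A compact sketch of that pre-flight check, assuming the same 400-style error reporting:

    // Returns the first pattern that fails to compile, or null if every entry is a valid regex.
    function firstInvalidPattern(patterns: string[] | undefined): string | null {
      for (const pattern of patterns ?? []) {
        try {
          new RegExp(pattern);
        } catch {
          return pattern;
        }
      }
      return null;
    }

    const bad = firstInvalidPattern(["^/docs/.*", "(unclosed"]);
    if (bad !== null) {
      // The controller responds with a 400 here instead of starting the crawl.
      console.warn(`Invalid path filter regex: ${bad}`);
    }
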
"../../search/fireEngine"; import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { performCosineSimilarity } from "../../lib/map-cosine"; -import { Logger } from "../../lib/logger"; +import { logger } from "../../lib/logger"; import Redis from "ioredis"; configDotenv(); -const redis = new Redis(process.env.REDIS_URL); +const redis = new Redis(process.env.REDIS_URL!); // Max Links that /map can return const MAX_MAP_LIMIT = 5000; @@ -44,8 +44,12 @@ export async function mapController( const sc: StoredCrawl = { originUrl: req.body.url, - crawlerOptions: legacyCrawlerOptions(req.body), - pageOptions: {}, + crawlerOptions: { + ...req.body, + scrapeOptions: undefined, + }, + scrapeOptions: scrapeOptions.parse({}), + internalOptions: {}, team_id: req.auth.team_id, createdAt: Date.now(), plan: req.auth.plan, @@ -65,8 +69,8 @@ export async function mapController( const cacheKey = `fireEngineMap:${mapUrl}`; const cachedResult = null; - let allResults: any[]; - let pagePromises: Promise[]; + let allResults: any[] = []; + let pagePromises: Promise[] = []; if (cachedResult) { allResults = JSON.parse(cachedResult); @@ -139,7 +143,7 @@ export async function mapController( return null; } }) - .filter((x) => x !== null); + .filter((x) => x !== null) as string[]; // allows for subdomains to be included links = links.filter((x) => isSameDomain(x, req.body.url)); @@ -153,7 +157,7 @@ export async function mapController( links = removeDuplicateUrls(links); billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => { - Logger.error( + logger.error( `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}` ); // Optionally, you could notify an admin or add to a retry queue here @@ -175,9 +179,8 @@ export async function mapController( mode: "map", url: req.body.url, crawlerOptions: {}, - pageOptions: {}, + scrapeOptions: {}, origin: req.body.origin, - extractor_options: { mode: "markdown" }, num_tokens: 0, }); diff --git a/apps/api/src/controllers/v1/scrape-status.ts b/apps/api/src/controllers/v1/scrape-status.ts index 5e0aecb6..db50f7d3 100644 --- a/apps/api/src/controllers/v1/scrape-status.ts +++ b/apps/api/src/controllers/v1/scrape-status.ts @@ -12,7 +12,7 @@ export async function scrapeStatusController(req: any, res: any) { const job = await supabaseGetJobByIdOnlyData(req.params.jobId); - if(job.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){ + if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){ return res.status(403).json({ success: false, error: "You are not allowed to access this resource.", diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 6fa855f7..3f6609af 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -1,10 +1,7 @@ -import { Request, Response } from "express"; -import { Logger } from "../../lib/logger"; +import { Response } from "express"; +import { logger } from "../../lib/logger"; import { Document, - legacyDocumentConverter, - legacyExtractorOptions, - legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, @@ -12,7 +9,6 @@ import { } from "./types"; import { billTeam } from "../../services/billing/credit_billing"; import { v4 as uuidv4 } from "uuid"; -import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; import { logJob } from "../../services/logging/log_job"; import { getJobPriority } from 
"../../lib/job-priority"; @@ -28,8 +24,6 @@ export async function scrapeController( const origin = req.body.origin; const timeout = req.body.timeout; - const pageOptions = legacyScrapeOptions(req.body); - const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined; const jobId = uuidv4(); const startTime = new Date().getTime(); @@ -43,11 +37,10 @@ export async function scrapeController( { url: req.body.url, mode: "single_urls", - crawlerOptions: {}, team_id: req.auth.team_id, - plan: req.auth.plan, - pageOptions, - extractorOptions, + scrapeOptions: req.body, + internalOptions: {}, + plan: req.auth.plan!, origin: req.body.origin, is_scrape: true, }, @@ -56,13 +49,13 @@ export async function scrapeController( jobPriority ); - const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0); + const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0); - let doc: any | undefined; + let doc: Document; try { - doc = (await waitForJob(jobId, timeout + totalWait))[0]; + doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this } catch (e) { - Logger.error(`Error in scrapeController: ${e}`); + logger.error(`Error in scrapeController: ${e}`); if (e instanceof Error && e.message.startsWith("Job wait")) { return res.status(408).json({ success: false, @@ -71,34 +64,19 @@ export async function scrapeController( } else { return res.status(500).json({ success: false, - error: `(Internal server error) - ${e && e?.message ? e.message : e} ${ - extractorOptions && extractorOptions.mode !== "markdown" - ? " - Could be due to LLM parsing issues" - : "" - }`, + error: `(Internal server error) - ${e && e?.message ? e.message : e}`, }); } } await getScrapeQueue().remove(jobId); - if (!doc) { - console.error("!!! PANIC DOC IS", doc); - return res.status(200).json({ - success: true, - warning: "No page found", - data: doc, - }); - } - - delete doc.index; - delete doc.provider; - const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = - doc && doc.markdown - ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") + doc && doc.extract + // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") + ? 
0 // TODO: fix : 0; let creditsToBeBilled = 1; // Assuming 1 credit per document @@ -111,22 +89,16 @@ export async function scrapeController( } billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => { - Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`); + logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`); // Optionally, you could notify an admin or add to a retry queue here }); - if (!pageOptions || !pageOptions.includeRawHtml) { + if (!req.body.formats.includes("rawHtml")) { if (doc && doc.rawHtml) { delete doc.rawHtml; } } - if(pageOptions && pageOptions.includeExtract) { - if(!pageOptions.includeMarkdown && doc && doc.markdown) { - delete doc.markdown; - } - } - logJob({ job_id: jobId, success: true, @@ -137,16 +109,14 @@ export async function scrapeController( team_id: req.auth.team_id, mode: "scrape", url: req.body.url, - crawlerOptions: {}, - pageOptions: pageOptions, + scrapeOptions: req.body, origin: origin, - extractor_options: extractorOptions, num_tokens: numTokens, }); return res.status(200).json({ success: true, - data: legacyDocumentConverter(doc), + data: doc, scrape_id: origin?.includes("website") ? jobId : undefined, }); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 9585175e..82b24c22 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -1,10 +1,11 @@ import { Request, Response } from "express"; import { z } from "zod"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; -import { Action, ExtractorOptions, PageOptions } from "../../lib/entities"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { PlanType } from "../../types"; import { countries } from "../../lib/validate-country"; +import { ExtractorOptions, PageOptions, ScrapeActionContent, Document as V0Document } from "../../lib/entities"; +import { InternalOptions } from "../../scraper/scrapeURL"; export type Format = | "markdown" @@ -167,6 +168,7 @@ export const scrapeRequestSchema = scrapeOptions.extend({ }); export type ScrapeRequest = z.infer; +export type ScrapeRequestInput = z.input; export const batchScrapeRequestSchema = scrapeOptions.extend({ urls: url.array(), @@ -240,7 +242,7 @@ export const mapRequestSchema = crawlerOptions.extend({ includeSubdomains: z.boolean().default(true), search: z.string().optional(), ignoreSitemap: z.boolean().default(false), - limit: z.number().min(1).max(5000).default(5000).optional(), + limit: z.number().min(1).max(5000).default(5000), }).strict(strictMessage); // export type MapRequest = { @@ -252,13 +254,14 @@ export type MapRequest = z.infer; export type Document = { markdown?: string; - extract?: string; + extract?: any; html?: string; rawHtml?: string; links?: string[]; screenshot?: string; actions?: { - screenshots: string[]; + screenshots?: string[]; + scrapes?: ScrapeActionContent[]; }; warning?: string; metadata: { @@ -291,11 +294,11 @@ export type Document = { publishedTime?: string; articleTag?: string; articleSection?: string; + url?: string; sourceURL?: string; statusCode?: number; error?: string; [key: string]: string | string[] | number | undefined; - }; }; @@ -366,7 +369,7 @@ export type CrawlStatusResponse = type AuthObject = { team_id: string; - plan: PlanType; + plan: PlanType | undefined; }; type Account = { @@ -439,7 +442,7 @@ export interface ResponseWithSentry< sentry?: string, } -export function 
legacyCrawlerOptions(x: CrawlerOptions) { +export function toLegacyCrawlerOptions(x: CrawlerOptions) { return { includes: x.includePaths, excludes: x.excludePaths, @@ -453,68 +456,90 @@ export function legacyCrawlerOptions(x: CrawlerOptions) { }; } -export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { +export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } { return { - includeMarkdown: x.formats.includes("markdown"), - includeHtml: x.formats.includes("html"), - includeRawHtml: x.formats.includes("rawHtml"), - includeExtract: x.formats.includes("extract"), - onlyIncludeTags: x.includeTags, - removeTags: x.excludeTags, - onlyMainContent: x.onlyMainContent, - waitFor: x.waitFor, - headers: x.headers, - includeLinks: x.formats.includes("links"), - screenshot: x.formats.includes("screenshot"), - fullPageScreenshot: x.formats.includes("screenshot@fullPage"), - parsePDF: x.parsePDF, - actions: x.actions as Action[], // no strict null checking grrrr - mogery - geolocation: x.location ?? x.geolocation, - skipTlsVerification: x.skipTlsVerification, - removeBase64Images: x.removeBase64Images, - mobile: x.mobile, - }; -} - -export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions { - return { - mode: x.mode ? "llm-extraction" : "markdown", - extractionPrompt: x.prompt ?? "Based on the information on the page, extract the information from the schema.", - extractionSchema: x.schema, - userPrompt: x.prompt ?? "", - }; -} - -export function legacyDocumentConverter(doc: any): Document { - if (doc === null || doc === undefined) return null; - - if (doc.metadata) { - if (doc.metadata.screenshot) { - doc.screenshot = doc.metadata.screenshot; - delete doc.metadata.screenshot; - } - - if (doc.metadata.fullPageScreenshot) { - doc.fullPageScreenshot = doc.metadata.fullPageScreenshot; - delete doc.metadata.fullPageScreenshot; - } - } - - return { - markdown: doc.markdown, - links: doc.linksOnPage, - rawHtml: doc.rawHtml, - html: doc.html, - extract: doc.llm_extraction, - screenshot: doc.screenshot ?? doc.fullPageScreenshot, - actions: doc.actions ?? undefined, - warning: doc.warning ?? undefined, - metadata: { - ...doc.metadata, - pageError: undefined, - pageStatusCode: undefined, - error: doc.metadata?.pageError, - statusCode: doc.metadata?.pageStatusCode, + crawlOptions: crawlerOptions.parse({ + includePaths: x.includes, + excludePaths: x.excludes, + limit: x.maxCrawledLinks ?? x.limit, + maxDepth: x.maxDepth, + allowBackwardLinks: x.allowBackwardCrawling, + allowExternalLinks: x.allowExternalContentLinks, + ignoreSitemap: x.ignoreSitemap, + // TODO: returnOnlyUrls support + }), + internalOptions: { + v0CrawlOnlyUrls: x.returnOnlyUrls, }, }; } + +export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } { + return { + scrapeOptions: scrapeOptions.parse({ + formats: [ + (pageOptions.includeMarkdown ?? true) ? "markdown" as const : null, + (pageOptions.includeHtml ?? false) ? "html" as const : null, + (pageOptions.includeRawHtml ?? false) ? "rawHtml" as const : null, + (pageOptions.screenshot ?? false) ? "screenshot" as const : null, + (pageOptions.fullPageScreenshot ?? false) ? "screenshot@fullPage" as const : null, + (extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction")) ? 
"extract" as const : null, + "links" + ].filter(x => x !== null), + waitFor: pageOptions.waitFor, + headers: pageOptions.headers, + includeTags: (typeof pageOptions.onlyIncludeTags === "string" ? [pageOptions.onlyIncludeTags] : pageOptions.onlyIncludeTags), + excludeTags: (typeof pageOptions.removeTags === "string" ? [pageOptions.removeTags] : pageOptions.removeTags), + onlyMainContent: pageOptions.onlyMainContent ?? false, + timeout: timeout, + parsePDF: pageOptions.parsePDF, + actions: pageOptions.actions, + location: pageOptions.geolocation, + skipTlsVerification: pageOptions.skipTlsVerification, + removeBase64Images: pageOptions.removeBase64Images, + extract: extractorOptions !== undefined && extractorOptions.mode.includes("llm-extraction") ? { + systemPrompt: extractorOptions.extractionPrompt, + prompt: extractorOptions.userPrompt, + schema: extractorOptions.extractionSchema, + } : undefined, + mobile: pageOptions.mobile, + }), + internalOptions: { + atsv: pageOptions.atsv, + v0DisableJsDom: pageOptions.disableJsDom, + v0UseFastMode: pageOptions.useFastMode, + }, + // TODO: fallback, fetchPageContent, replaceAllPathsWithAbsolutePaths, includeLinks + } +} + +export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} { + const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout); + const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions); + return { scrapeOptions, internalOptions: Object.assign(i1, i2) }; +} + +export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } { + if (internalOptions.v0CrawlOnlyUrls) { + return { url: document.metadata.sourceURL! 
}; + } + + return { + content: document.markdown!, + markdown: document.markdown!, + html: document.html, + rawHtml: document.rawHtml, + linksOnPage: document.links, + llm_extraction: document.extract, + metadata: { + ...document.metadata, + error: undefined, + statusCode: undefined, + pageError: document.metadata.error, + pageStatusCode: document.metadata.statusCode, + screenshot: document.screenshot, + }, + actions: document.actions , + warning: document.warning, + } +} diff --git a/apps/api/src/example.ts b/apps/api/src/example.ts deleted file mode 100644 index edf0faef..00000000 --- a/apps/api/src/example.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { WebScraperDataProvider } from "./scraper/WebScraper"; - -async function example() { - const example = new WebScraperDataProvider(); - - await example.setOptions({ - jobId: "TEST", - mode: "crawl", - urls: ["https://mendable.ai"], - crawlerOptions: {}, - }); - const docs = await example.getDocuments(false); - docs.map((doc) => { - console.log(doc.metadata.sourceURL); - }); - console.log(docs.length); -} - -// example(); diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 5ccbb9cc..7f7ec036 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -6,28 +6,24 @@ import bodyParser from "body-parser"; import cors from "cors"; import { getScrapeQueue } from "./services/queue-service"; import { v0Router } from "./routes/v0"; -import { initSDK } from "@hyperdx/node-opentelemetry"; import os from "os"; -import { Logger } from "./lib/logger"; +import { logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; -import { ScrapeEvents } from "./lib/scrape-events"; import http from 'node:http'; import https from 'node:https'; import CacheableLookup from 'cacheable-lookup'; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; -import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; -import dns from 'node:dns'; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; -Logger.info(`Number of CPUs: ${numCPUs} available`); +logger.info(`Number of CPUs: ${numCPUs} available`); const cacheable = new CacheableLookup() @@ -55,7 +51,6 @@ const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ serverAdapter: serverAdapter, }); - app.use( `/admin/${process.env.BULL_AUTH_KEY}/queues`, serverAdapter.getRouter() @@ -78,15 +73,10 @@ app.use(adminRouter); const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? 
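
`toLegacyDocument` above is the bridge back to the v0 response shape. A self-contained sketch of the same field mapping with trimmed-down types; the real `Document`/`V0Document` types carry more metadata:

    type V1Doc = {
      markdown?: string;
      html?: string;
      rawHtml?: string;
      links?: string[];
      extract?: unknown;
      metadata: { sourceURL?: string; statusCode?: number; error?: string };
    };

    type V0Doc = {
      content: string;
      markdown?: string;
      html?: string;
      rawHtml?: string;
      linksOnPage?: string[];
      llm_extraction?: unknown;
      metadata: { sourceURL?: string; pageStatusCode?: number; pageError?: string };
    };

    function toV0(doc: V1Doc): V0Doc {
      return {
        content: doc.markdown ?? "",
        markdown: doc.markdown,
        html: doc.html,
        rawHtml: doc.rawHtml,
        linksOnPage: doc.links,
        llm_extraction: doc.extract,
        metadata: {
          sourceURL: doc.metadata.sourceURL,
          // v0 exposes these under the page* names instead.
          pageStatusCode: doc.metadata.statusCode,
          pageError: doc.metadata.error,
        },
      };
    }
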
"localhost"; -// HyperDX OpenTelemetry -if (process.env.ENV === "production") { - initSDK({ consoleCapture: true, additionalInstrumentations: [] }); -} - function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { - Logger.info(`Worker ${process.pid} listening on port ${port}`); - Logger.info( + logger.info(`Worker ${process.pid} listening on port ${port}`); + logger.info( `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` ); }); @@ -103,7 +93,6 @@ app.get(`/serverHealthCheck`, async (req, res) => { const [waitingJobs] = await Promise.all([ scrapeQueue.getWaitingCount(), ]); - const noWaitingJobs = waitingJobs === 0; // 200 if no active jobs, 503 if there are active jobs return res.status(noWaitingJobs ? 200 : 500).json({ @@ -111,7 +100,7 @@ app.get(`/serverHealthCheck`, async (req, res) => { }); } catch (error) { Sentry.captureException(error); - Logger.error(error); + logger.error(error); return res.status(500).json({ error: error.message }); } }); @@ -140,7 +129,7 @@ app.get("/serverHealthCheck/notify", async (req, res) => { // Re-check the waiting jobs count after the timeout waitingJobsCount = await getWaitingJobsCount(); if (waitingJobsCount >= treshold) { - const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; + const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL!; const message = { text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ timeout / 60000 @@ -156,14 +145,14 @@ app.get("/serverHealthCheck/notify", async (req, res) => { }); if (!response.ok) { - Logger.error("Failed to send Slack notification"); + logger.error("Failed to send Slack notification"); } } }, timeout); } } catch (error) { Sentry.captureException(error); - Logger.debug(error); + logger.debug(error); } }; @@ -178,7 +167,7 @@ app.get("/is-production", (req, res) => { app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { if (err instanceof ZodError) { if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) { - Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body)); + logger.warn("Unsupported protocol error: " + JSON.stringify(req.body)); } res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); @@ -206,11 +195,11 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response } } - Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); + logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. 
Your exception ID is " + id }); }); -Logger.info(`Worker ${process.pid} started`); +logger.info(`Worker ${process.pid} started`); // const sq = getScrapeQueue(); diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index d05f9bd7..430dc1d4 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -4,19 +4,19 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation import { generateOpenAICompletions } from "./models"; import { Document, ExtractorOptions } from "../entities"; -import { Logger } from "../logger"; +import { logger } from "../logger"; // Generate completion using OpenAI export async function generateCompletions( documents: Document[], - extractionOptions: ExtractorOptions, + extractionOptions: ExtractorOptions | undefined, mode: "markdown" | "raw-html" ): Promise { // const schema = zodToJsonSchema(options.schema) - const schema = extractionOptions.extractionSchema; - const systemPrompt = extractionOptions.extractionPrompt; - const prompt = extractionOptions.userPrompt; + const schema = extractionOptions?.extractionSchema; + const systemPrompt = extractionOptions?.extractionPrompt; + const prompt = extractionOptions?.userPrompt; const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider @@ -51,7 +51,7 @@ export async function generateCompletions( return completionResult; } catch (error) { - Logger.error(`Error generating completions: ${error}`); + logger.error(`Error generating completions: ${error}`); throw error; } default: diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 23147b12..f777dce9 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -95,7 +95,7 @@ export async function generateOpenAICompletions({ try { llmExtraction = JSON.parse( - jsonCompletion.choices[0].message.content.trim() + (jsonCompletion.choices[0].message.content ?? 
"").trim() ); } catch (e) { throw new Error("Invalid JSON"); diff --git a/apps/api/src/lib/batch-process.ts b/apps/api/src/lib/batch-process.ts index 30289fd0..802d1eb1 100644 --- a/apps/api/src/lib/batch-process.ts +++ b/apps/api/src/lib/batch-process.ts @@ -3,7 +3,7 @@ export async function batchProcess( batchSize: number, asyncFunction: (item: T, index: number) => Promise ): Promise { - const batches = []; + const batches: T[][] = []; for (let i = 0; i < array.length; i += batchSize) { const batch = array.slice(i, i + batchSize); batches.push(batch); diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 379bc179..41cbb07c 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -1,13 +1,16 @@ +import { InternalOptions } from "../scraper/scrapeURL"; +import { ScrapeOptions } from "../controllers/v1/types"; import { WebCrawler } from "../scraper/WebScraper/crawler"; import { redisConnection } from "../services/queue-service"; -import { Logger } from "./logger"; +import { logger } from "./logger"; export type StoredCrawl = { originUrl?: string; crawlerOptions: any; - pageOptions: any; + scrapeOptions: Omit; + internalOptions: InternalOptions; team_id: string; - plan: string; + plan?: string; robots?: string; cancelled?: boolean; createdAt: number; @@ -100,7 +103,7 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise urlO.hash = ""; url = urlO.href; } catch (error) { - Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); + logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); } const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0 @@ -117,7 +120,7 @@ export async function lockURLs(id: string, urls: string[]): Promise { urlO.hash = ""; return urlO.href; } catch (error) { - Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); + logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); } return url; @@ -131,7 +134,7 @@ export async function lockURLs(id: string, urls: string[]): Promise { export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { const crawler = new WebCrawler({ jobId: id, - initialUrl: sc.originUrl, + initialUrl: sc.originUrl!, includes: sc.crawlerOptions?.includes ?? [], excludes: sc.crawlerOptions?.excludes ?? [], maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 
1000, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 2c8f376d..9b68f425 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -1,3 +1,5 @@ +import type { Document as V1Document } from "../controllers/v1/types"; + export interface Progress { current: number; total: number; @@ -129,7 +131,8 @@ export class Document { provider?: string; warning?: string; actions?: { - screenshots: string[]; + screenshots?: string[]; + scrapes?: ScrapeActionContent[]; } index?: number; diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 396b7fe7..8800d916 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -5,7 +5,7 @@ import "../services/sentry" import * as Sentry from "@sentry/node"; import dotenv from 'dotenv'; -import { Logger } from './logger'; +import { logger } from './logger'; dotenv.config(); // TODO: add a timeout to the Go parser @@ -40,7 +40,7 @@ class GoMarkdownConverter { } } -export async function parseMarkdown(html: string): Promise { +export async function parseMarkdown(html: string | null | undefined): Promise { if (!html) { return ''; } @@ -52,12 +52,12 @@ export async function parseMarkdown(html: string): Promise { markdownContent = processMultiLineLinks(markdownContent); markdownContent = removeSkipToContentLinks(markdownContent); - Logger.info(`HTML to Markdown conversion using Go parser successful`); + logger.info(`HTML to Markdown conversion using Go parser successful`); return markdownContent; } } catch (error) { Sentry.captureException(error); - Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`); + logger.error(`Error converting HTML to Markdown with Go parser: ${error}`); } // Fallback to TurndownService if Go parser fails or is not enabled diff --git a/apps/api/src/lib/job-priority.ts b/apps/api/src/lib/job-priority.ts index 9d046052..27e45230 100644 --- a/apps/api/src/lib/job-priority.ts +++ b/apps/api/src/lib/job-priority.ts @@ -1,6 +1,6 @@ import { redisConnection } from "../../src/services/queue-service"; import { PlanType } from "../../src/types"; -import { Logger } from "./logger"; +import { logger } from "./logger"; const SET_KEY_PREFIX = "limit_team_id:"; export async function addJobPriority(team_id, job_id) { @@ -13,7 +13,7 @@ export async function addJobPriority(team_id, job_id) { // This approach will reset the expiration time to 60 seconds every time a new job is added to the set. 
await redisConnection.expire(setKey, 60); } catch (e) { - Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`); + logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`); } } @@ -24,7 +24,7 @@ export async function deleteJobPriority(team_id, job_id) { // remove job_id from the set await redisConnection.srem(setKey, job_id); } catch (e) { - Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`); + logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`); } } @@ -33,7 +33,7 @@ export async function getJobPriority({ team_id, basePriority = 10, }: { - plan: PlanType; + plan: PlanType | undefined; team_id: string; basePriority?: number; }): Promise { @@ -95,7 +95,7 @@ export async function getJobPriority({ ); } } catch (e) { - Logger.error( + logger.error( `Get job priority failed: ${team_id}, ${plan}, ${basePriority}` ); return basePriority; diff --git a/apps/api/src/lib/load-testing-example.ts b/apps/api/src/lib/load-testing-example.ts deleted file mode 100644 index 01b61db9..00000000 --- a/apps/api/src/lib/load-testing-example.ts +++ /dev/null @@ -1,42 +0,0 @@ -// import { scrapWithFireEngine } from "../../src/scraper/WebScraper/single_url"; - -// const delay = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); - -// const scrapInBatches = async ( -// urls: string[], -// batchSize: number, -// delayMs: number -// ) => { -// let successCount = 0; -// let errorCount = 0; - -// for (let i = 0; i < urls.length; i += batchSize) { -// const batch = urls -// .slice(i, i + batchSize) -// .map((url) => scrapWithFireEngine(url)); -// try { -// const results = await Promise.all(batch); -// results.forEach((data, index) => { -// if (data.trim() === "") { -// errorCount++; -// } else { -// successCount++; -// console.log( -// `Scraping result ${i + index + 1}:`, -// data.trim().substring(0, 20) + "..." -// ); -// } -// }); -// } catch (error) { -// console.error("Error during scraping:", error); -// } -// await delay(delayMs); -// } - -// console.log(`Total successful scrapes: ${successCount}`); -// console.log(`Total errored scrapes: ${errorCount}`); -// }; -// function run() { -// const urls = Array.from({ length: 200 }, () => "https://scrapethissite.com"); -// scrapInBatches(urls, 10, 1000); -// } diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index 7eca1ef0..33aa949b 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -1,57 +1,82 @@ +import * as winston from "winston"; + import { configDotenv } from "dotenv"; +import Transport from "winston-transport"; configDotenv(); -enum LogLevel { - NONE = 'NONE', // No logs will be output. - ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. - WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors. - INFO = 'INFO', // For logging informational messages that highlight the progress of the application. - DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging. - TRACE = 'TRACE' // For logging more detailed information than the DEBUG level. 
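
The job-priority helper changed above now tolerates an undefined plan and falls back to `basePriority` when the Redis lookup fails. A short calling sketch, with the import path given relative to `apps/api/src` (adjust for the caller's location):

    import { getJobPriority } from "./lib/job-priority";

    async function priorityFor(team_id: string): Promise<number> {
      // plan may be undefined now; on Redis errors the helper logs and returns basePriority.
      return getJobPriority({ plan: undefined, team_id, basePriority: 10 });
    }
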
-} -export class Logger { - static colors = { - ERROR: '\x1b[31m%s\x1b[0m', // Red - WARN: '\x1b[33m%s\x1b[0m', // Yellow - INFO: '\x1b[34m%s\x1b[0m', // Blue - DEBUG: '\x1b[36m%s\x1b[0m', // Cyan - TRACE: '\x1b[35m%s\x1b[0m' // Magenta - }; - - static log (message: string, level: LogLevel) { - const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.TRACE; - const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE]; - const currentLevelIndex = levels.indexOf(logLevel); - const messageLevelIndex = levels.indexOf(level); - - if (currentLevelIndex >= messageLevelIndex) { - const color = Logger.colors[level]; - console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`); - - // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; - // if (useDbAuthentication) { - // save to supabase? another place? - // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean }); - // } +const logFormat = winston.format.printf(info => + `${info.timestamp} ${info.level} [${info.metadata.module ?? ""}:${info.metadata.method ?? ""}]: ${info.message} ${info.level.includes("error") || info.level.includes("warn") ? JSON.stringify( + info.metadata, + (_, value) => { + if (value instanceof Error) { + return { + ...value, + name: value.name, + message: value.message, + stack: value.stack, + cause: value.cause, + } + } else { + return value; + } } - } - static error(message: string | any) { - Logger.log(message, LogLevel.ERROR); + ) : ""}` +) + +export const logger = winston.createLogger({ + level: process.env.LOGGING_LEVEL?.toLowerCase() ?? "debug", + format: winston.format.json({ + replacer(key, value) { + if (value instanceof Error) { + return { + ...value, + name: value.name, + message: value.message, + stack: value.stack, + cause: value.cause, + } + } else { + return value; + } + } + }), + transports: [ + new winston.transports.Console({ + format: winston.format.combine( + winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }), + winston.format.metadata({ fillExcept: ["message", "level", "timestamp"] }), + ...(((process.env.ENV === "production" && process.env.SENTRY_ENVIRONMENT === "dev") || (process.env.ENV !== "production")) ? 
[winston.format.colorize(), logFormat] : []), + ), + }), + ], +}); + +export type ArrayTransportOptions = Transport.TransportStreamOptions & { + array: any[]; + scrapeId?: string; +}; + +export class ArrayTransport extends Transport { + private array: any[]; + private scrapeId?: string; + + constructor(opts: ArrayTransportOptions) { + super(opts); + this.array = opts.array; + this.scrapeId = opts.scrapeId; } - static warn(message: string) { - Logger.log(message, LogLevel.WARN); - } + log(info, next) { + setImmediate(() => { + this.emit("logged", info); + }); - static info(message: string) { - Logger.log(message, LogLevel.INFO); - } + if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) { + return next(); + } - static debug(message: string) { - Logger.log(message, LogLevel.DEBUG); - } + this.array.push(info); - static trace(message: string) { - Logger.log(message, LogLevel.TRACE); + next(); } -} +} \ No newline at end of file diff --git a/apps/api/src/lib/map-cosine.ts b/apps/api/src/lib/map-cosine.ts index db2491a9..2a089548 100644 --- a/apps/api/src/lib/map-cosine.ts +++ b/apps/api/src/lib/map-cosine.ts @@ -1,4 +1,4 @@ -import { Logger } from "./logger"; +import { logger } from "./logger"; export function performCosineSimilarity(links: string[], searchQuery: string) { try { @@ -40,7 +40,7 @@ export function performCosineSimilarity(links: string[], searchQuery: string) { links = a.map((item) => item.link); return links; } catch (error) { - Logger.error(`Error performing cosine similarity: ${error}`); + logger.error(`Error performing cosine similarity: ${error}`); return links; } } diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index ad70dfef..83873a58 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -1,8 +1,8 @@ import { Job } from "bullmq"; -import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; -import { Logger } from "./logger"; +import { logger } from "./logger"; import { configDotenv } from "dotenv"; +import { Engine } from "../scraper/scrapeURL/engines"; configDotenv(); export type ScrapeErrorEvent = { @@ -15,7 +15,7 @@ export type ScrapeScrapeEvent = { type: "scrape", url: string, worker?: string, - method: (typeof baseScrapers)[number], + method: Engine, result: null | { success: boolean, response_code?: number, @@ -49,7 +49,7 @@ export class ScrapeEvents { }).select().single(); return (result.data as any).id; } catch (error) { - // Logger.error(`Error inserting scrape event: ${error}`); + // logger.error(`Error inserting scrape event: ${error}`); return null; } } @@ -69,7 +69,7 @@ export class ScrapeEvents { } }).eq("id", logId); } catch (error) { - Logger.error(`Error updating scrape result: ${error}`); + logger.error(`Error updating scrape result: ${error}`); } } @@ -81,7 +81,7 @@ export class ScrapeEvents { worker: process.env.FLY_MACHINE_ID, }); } catch (error) { - Logger.error(`Error logging job event: ${error}`); + logger.error(`Error logging job event: ${error}`); } } } diff --git a/apps/api/src/lib/supabase-jobs.ts b/apps/api/src/lib/supabase-jobs.ts index c418a6e0..c9be72a3 100644 --- a/apps/api/src/lib/supabase-jobs.ts +++ b/apps/api/src/lib/supabase-jobs.ts @@ -1,5 +1,5 @@ import { supabase_service } from "../services/supabase"; -import { Logger } from "./logger"; +import { logger } from "./logger"; import * as Sentry from "@sentry/node"; /** @@ -37,7 +37,7 @@ export const supabaseGetJobsById = 
async (jobIds: string[]) => { .in("job_id", jobIds); if (error) { - Logger.error(`Error in supabaseGetJobsById: ${error}`); + logger.error(`Error in supabaseGetJobsById: ${error}`); Sentry.captureException(error); return []; } @@ -61,7 +61,7 @@ export const supabaseGetJobsByCrawlId = async (crawlId: string) => { .eq("crawl_id", crawlId) if (error) { - Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`); + logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`); Sentry.captureException(error); return []; } diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index b45b8973..a6cd539d 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,30 +1,25 @@ import { AuthResponse } from "../../src/types"; -import { Logger } from "./logger"; +import { logger } from "./logger"; import * as Sentry from "@sentry/node"; import { configDotenv } from "dotenv"; configDotenv(); let warningCount = 0; -export function withAuth( - originalFunction: (...args: U) => Promise +export function withAuth( + originalFunction: (...args: U) => Promise, + mockSuccess: T, ) { return async function (...args: U): Promise { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; if (!useDbAuthentication) { if (warningCount < 5) { - Logger.warn("You're bypassing authentication"); + logger.warn("You're bypassing authentication"); warningCount++; } return { success: true } as T; } else { - try { - return await originalFunction(...args); - } catch (error) { - Sentry.captureException(error); - Logger.error(`Error in withAuth function: ${error}`); - return { success: false, error: error.message } as T; - } + return await originalFunction(...args); } }; } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 8bd0c12c..90d4a47f 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -1,151 +1,127 @@ import { Job } from "bullmq"; import { - CrawlResult, WebScraperOptions, RunWebScraperParams, RunWebScraperResult, } from "../types"; -import { WebScraperDataProvider } from "../scraper/WebScraper"; -import { DocumentUrl, Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; -import { Document } from "../lib/entities"; +import { Document } from "../controllers/v1/types"; import { supabase_service } from "../services/supabase"; -import { Logger } from "../lib/logger"; +import { logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; import { configDotenv } from "dotenv"; +import { EngineResultsTracker, scrapeURL, ScrapeUrlResponse } from "../scraper/scrapeURL"; +import { Engine } from "../scraper/scrapeURL/engines"; configDotenv(); export async function startWebScraperPipeline({ job, token, }: { - job: Job; + job: Job & { id: string }; token: string; }) { - let partialDocs: Document[] = []; return (await runWebScraper({ url: job.data.url, mode: job.data.mode, - crawlerOptions: job.data.crawlerOptions, - extractorOptions: job.data.extractorOptions, - pageOptions: { - ...job.data.pageOptions, + scrapeOptions: { + ...job.data.scrapeOptions, ...(job.data.crawl_id ? 
({ - includeRawHtml: true, + formats: job.data.scrapeOptions.formats.concat(["rawHtml"]), }): {}), }, - inProgress: (progress) => { - Logger.debug(`🐂 Job in progress ${job.id}`); - if (progress.currentDocument) { - partialDocs.push(progress.currentDocument); - if (partialDocs.length > 50) { - partialDocs = partialDocs.slice(-50); - } - // job.updateProgress({ ...progress, partialDocs: partialDocs }); - } - }, - onSuccess: (result, mode) => { - Logger.debug(`🐂 Job completed ${job.id}`); - saveJob(job, result, token, mode); - }, - onError: (error) => { - Logger.error(`🐂 Job failed ${job.id}`); - ScrapeEvents.logJobEvent(job, "failed"); - job.moveToFailed(error, token, false); - }, + internalOptions: job.data.internalOptions, + // onSuccess: (result, mode) => { + // logger.debug(`🐂 Job completed ${job.id}`); + // saveJob(job, result, token, mode); + // }, + // onError: (error) => { + // logger.error(`🐂 Job failed ${job.id}`); + // ScrapeEvents.logJobEvent(job, "failed"); + // }, team_id: job.data.team_id, bull_job_id: job.id.toString(), priority: job.opts.priority, is_scrape: job.data.is_scrape ?? false, - })) as { success: boolean; message: string; docs: Document[] }; + })); } export async function runWebScraper({ url, mode, - crawlerOptions, - pageOptions, - extractorOptions, - inProgress, - onSuccess, - onError, + scrapeOptions, + internalOptions, + // onSuccess, + // onError, team_id, bull_job_id, priority, is_scrape=false, -}: RunWebScraperParams): Promise { +}: RunWebScraperParams): Promise { + let response: ScrapeUrlResponse | undefined = undefined; + let engines: EngineResultsTracker = {}; try { - const provider = new WebScraperDataProvider(); - if (mode === "crawl") { - await provider.setOptions({ - jobId: bull_job_id, - mode: mode, - urls: [url], - extractorOptions, - crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - bullJobId: bull_job_id, - priority, - }); - } else { - await provider.setOptions({ - jobId: bull_job_id, - mode: mode, - urls: url.split(","), - extractorOptions, - crawlerOptions: crawlerOptions, - pageOptions: pageOptions, - priority, - teamId: team_id - }); + response = await scrapeURL(bull_job_id, url, scrapeOptions, { priority, ...internalOptions }); + if (!response.success) { + if (response.error instanceof Error) { + throw response.error; + } else { + throw new Error("scrapeURL error: " + (Array.isArray(response.error) ? JSON.stringify(response.error) : typeof response.error === "object" ? JSON.stringify({ ...response.error }) : response.error)); + } } - const docs = (await provider.getDocuments(false, (progress: Progress) => { - inProgress(progress); - })) as Document[]; - - if (docs.length === 0) { - return { - success: true, - message: "No pages found", - docs: [], - }; - } - - // remove docs with empty content - const filteredDocs = crawlerOptions?.returnOnlyUrls - ? 
docs.map((doc) => { - if (doc.metadata.sourceURL) { - return { url: doc.metadata.sourceURL }; - } - }) - : docs; if(is_scrape === false) { let creditsToBeBilled = 1; // Assuming 1 credit per document - if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) { + if (scrapeOptions.extract) { creditsToBeBilled = 5; } - billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => { - Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`); + billTeam(team_id, undefined, creditsToBeBilled).catch(error => { + logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); // Optionally, you could notify an admin or add to a retry queue here }); } - - // This is where the returnvalue from the job is set - onSuccess(filteredDocs, mode); + // onSuccess(response.document, mode); - // this return doesn't matter too much for the job completion result - return { success: true, message: "", docs: filteredDocs }; + engines = response.engines; + return response; } catch (error) { - onError(error); - return { success: false, message: error.message, docs: [] }; + engines = response !== undefined ? response.engines : ((typeof error === "object" && error !== null ? (error as any).results ?? {} : {})); + + if (response !== undefined) { + return { + ...response, + success: false, + error, + } + } else { + return { success: false, error, logs: ["no logs -- error coming from runWebScraper"], engines }; + } + // onError(error); + } finally { + const engineOrder = Object.entries(engines).sort((a, b) => a[1].startedAt - b[1].startedAt).map(x => x[0]) as Engine[]; + + for (const engine of engineOrder) { + const result = engines[engine] as Exclude; + ScrapeEvents.insert(bull_job_id, { + type: "scrape", + url, + method: engine, + result: { + success: result.state === "success", + response_code: (result.state === "success" ? result.result.statusCode : undefined), + response_size: (result.state === "success" ? result.result.html.length : undefined), + error: (result.state === "error" ? result.error : result.state === "timeout" ? 
"Timed out" : undefined), + time_taken: result.finishedAt - result.startedAt, + }, + }); + } } } -const saveJob = async (job: Job, result: any, token: string, mode: string) => { +const saveJob = async (job: Job, result: any, token: string, mode: string, engines?: EngineResultsTracker) => { try { const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; if (useDbAuthentication) { @@ -173,6 +149,6 @@ const saveJob = async (job: Job, result: any, token: string, mode: string) => { } ScrapeEvents.logJobEvent(job, "completed"); } catch (error) { - Logger.error(`🐂 Failed to update job status: ${error}`); + logger.error(`🐂 Failed to update job status: ${error}`); } }; diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index 88159060..ac61519a 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -6,8 +6,8 @@ import { cleanBefore24hCompleteJobsController, queuesController, } from "../controllers/v0/admin/queue"; -import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; import { wrap } from "./v1"; +import { acucCacheClearController } from "../controllers/v0/admin/acuc-cache-clear"; export const adminRouter = express.Router(); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 4e4b6052..3eaace3b 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -14,7 +14,7 @@ import expressWs from "express-ws"; import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; import { crawlCancelController } from "../controllers/v1/crawl-cancel"; -import { Logger } from "../lib/logger"; +import { logger } from "../lib/logger"; import { scrapeStatusController } from "../controllers/v1/scrape-status"; import { concurrencyCheckController } from "../controllers/v1/concurrency-check"; import { batchScrapeController } from "../controllers/v1/batch-scrape"; @@ -32,10 +32,12 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R if (!minimum && req.body) { minimum = (req.body as any)?.limit ?? (req.body as any)?.urls?.length ?? 1; } - const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum); - req.acuc = chunk; + const { success, remainingCredits, chunk } = await checkTeamCredits(req.acuc, req.auth.team_id, minimum ?? 1); + if (chunk) { + req.acuc = chunk; + } if (!success) { - Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); + logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); if (!res.headersSent) { return res.status(402).json({ success: false, error: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value." 
}); } @@ -50,20 +52,27 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { return (req, res, next) => { (async () => { - const { success, team_id, error, status, plan, chunk } = await authenticateUser( + const auth = await authenticateUser( req, res, rateLimiterMode, ); - if (!success) { + if (!auth.success) { if (!res.headersSent) { - return res.status(status).json({ success: false, error }); + return res.status(auth.status).json({ success: false, error: auth.error }); + } else { + return; } } + const { team_id, plan, chunk } = auth; + req.auth = { team_id, plan }; - req.acuc = chunk; + req.acuc = chunk ?? undefined; + if (chunk) { + req.account = { remainingCredits: chunk.remaining_credits }; + } next(); })() .catch(err => next(err)); diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index 20419ffa..eba0ddb4 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -2,7 +2,6 @@ import { WebCrawler } from '../crawler'; import axios from 'axios'; import robotsParser from 'robots-parser'; -import { getAdjustedMaxDepth } from '../utils/maxDepthUtils'; jest.mock('axios'); jest.mock('robots-parser'); @@ -35,165 +34,6 @@ describe('WebCrawler', () => { }); }); - it('should filter out links that exceed maxDepth param of 2 based on enterURL depth of 0 ', async () => { - const initialUrl = 'http://example.com'; // Set initial URL for this test - const enteredMaxCrawledDepth = 2; - maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - - - crawler = new WebCrawler({ - jobId: "TEST", - initialUrl: initialUrl, - includes: [], - excludes: [], - limit: 100, - maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing - }); - - // Mock sitemap fetching function to return controlled links - crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ - initialUrl, // depth 0 - initialUrl + '/page1', // depth 1 - initialUrl + '/page1/page2', // depth 2 - initialUrl + '/page1/page2/page3' // depth 3, should be filtered out - ]); - - const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); - expect(results).toEqual([ - { url: initialUrl, html: '' }, - { url: initialUrl + '/page1', html: '' }, - { url: initialUrl + '/page1/page2', html: '' } - ]); - - - // Ensure that the link with depth 3 is not included - expect(results.some(r => r.url === initialUrl + '/page1/page2/page3')).toBe(false); - }); - - it('should filter out links that exceed maxDepth param of 0 based on enterURL depth of 0 ', async () => { - const initialUrl = 'http://example.com'; // Set initial URL for this test - const enteredMaxCrawledDepth = 0; - maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - - - crawler = new WebCrawler({ - jobId: "TEST", - initialUrl: initialUrl, - includes: [], - excludes: [], - limit: 100, - maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing - }); - - // Mock sitemap fetching function to return controlled links - crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ - initialUrl, // depth 0 - initialUrl + '/page1', // depth 1 - initialUrl + '/page1/page2', // depth 2 - initialUrl + '/page1/page2/page3' // depth 3, should be filtered out - ]); - - const results = 
await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); - expect(results).toEqual([ - { url: initialUrl, html: '' }, - ]); - }); - - it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 1 ', async () => { - const initialUrl = 'http://example.com/page1'; // Set initial URL for this test - const enteredMaxCrawledDepth = 1; - maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - - - crawler = new WebCrawler({ - jobId: "TEST", - initialUrl: initialUrl, - includes: [], - excludes: [], - limit: 100, - maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing - }); - - // Mock sitemap fetching function to return controlled links - crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ - initialUrl, // depth 0 - initialUrl + '/page2', // depth 1 - initialUrl + '/page2/page3', // depth 2 - initialUrl + '/page2/page3/page4' // depth 3, should be filtered out - ]); - - const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); - expect(results).toEqual([ - { url: initialUrl, html: '' }, - { url: initialUrl + '/page2', html: '' } - ]); - }); - - it('should filter out links that exceed maxDepth param of 1 based on enterURL depth of 2 ', async () => { - const initialUrl = 'http://example.com/page1'; // Set initial URL for this test - const enteredMaxCrawledDepth = 2; - maxCrawledDepth = getAdjustedMaxDepth(initialUrl, enteredMaxCrawledDepth); - - - crawler = new WebCrawler({ - jobId: "TEST", - initialUrl: initialUrl, - includes: [], - excludes: [], - limit: 100, - maxCrawledDepth: maxCrawledDepth, // Set maxDepth for testing - }); - - // Mock sitemap fetching function to return controlled links - crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ - initialUrl, // depth 0 - initialUrl + '/page2', // depth 1 - initialUrl + '/page2/page3', // depth 2 - initialUrl + '/page2/page3/page4' // depth 3, should be filtered out - ]); - - const results = await crawler.start(undefined, undefined, undefined, undefined, undefined, maxCrawledDepth); - expect(results).toEqual([ - { url: initialUrl, html: '' }, - { url: initialUrl + '/page2', html: '' }, - { url: initialUrl + '/page2/page3', html: '' } - ]); - }); - - it('should handle allowBackwardCrawling option correctly', async () => { - const initialUrl = 'https://mendable.ai/blog'; - - // Setup the crawler with the specific test case options - const crawler = new WebCrawler({ - jobId: "TEST", - initialUrl: initialUrl, - includes: [], - excludes: [], - limit: 100, - maxCrawledDepth: 3, // Example depth - allowBackwardCrawling: true - }); - - // Mock the sitemap fetching function to simulate backward crawling - crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([ - initialUrl, - 'https://mendable.ai', // backward link - initialUrl + '/page1', - initialUrl + '/page1/page2' - ]); - - const results = await crawler.start(); - expect(results).toEqual([ - { url: initialUrl, html: '' }, - { url: 'https://mendable.ai', html: '' }, // Expect the backward link to be included - { url: initialUrl + '/page1', html: '' }, - { url: initialUrl + '/page1/page2', html: '' } - ]); - - // Check that the backward link is included if allowBackwardCrawling is true - expect(results.some(r => r.url === 'https://mendable.ai')).toBe(true); - }); - it('should respect the limit parameter by not returning more links than specified', async () => { const initialUrl = 'http://example.com'; const limit = 2; // Set a limit 
for the number of links diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts deleted file mode 100644 index 02c8a7e0..00000000 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { scrapSingleUrl } from '../single_url'; -import { PageOptions } from '../../../lib/entities'; - - -jest.mock('../single_url', () => { - const originalModule = jest.requireActual('../single_url'); - originalModule.fetchHtmlContent = jest.fn().mockResolvedValue('<html><head><title>Test</title></head><body><h1>Roast</h1></body></html>
'); - - return originalModule; -}); - -describe('scrapSingleUrl', () => { - it('should handle includeHtml option correctly', async () => { - const url = 'https://roastmywebsite.ai'; - const pageOptionsWithHtml: PageOptions = { includeHtml: true }; - const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; - - const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml); - const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml); - - expect(resultWithHtml.html).toBeDefined(); - expect(resultWithoutHtml.html).toBeUndefined(); - }, 10000); -}); - -it('should return a list of links on the firecrawl.ai page', async () => { - const url = 'https://flutterbricks.com'; - const pageOptions: PageOptions = { includeHtml: true }; - - const result = await scrapSingleUrl("TEST", url, pageOptions); - - // Check if the result contains a list of links - expect(result.linksOnPage).toBeDefined(); - expect(Array.isArray(result.linksOnPage)).toBe(true); - expect(result.linksOnPage.length).toBeGreaterThan(0); - expect(result.linksOnPage).toContain('https://flutterbricks.com/features') -}, 15000); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 1ae166b4..9e3f7cd2 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -2,13 +2,10 @@ import axios, { AxiosError } from "axios"; import cheerio, { load } from "cheerio"; import { URL } from "url"; import { getLinksFromSitemap } from "./sitemap"; -import async from "async"; -import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; -import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; -import { Logger } from "../../../src/lib/logger"; +import { logger } from "../../../src/lib/logger"; import https from "https"; export class WebCrawler { private jobId: string; @@ -73,7 +70,7 @@ export class WebCrawler { try { url = new URL(link.trim(), this.baseUrl); } catch (error) { - Logger.debug(`Error processing link: ${link} | Error: ${error.message}`); + logger.debug(`Error processing link: ${link} | Error: ${error.message}`); return false; } const path = url.pathname; @@ -132,7 +129,7 @@ export class WebCrawler { const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? 
true; // Check if the link is disallowed by robots.txt if (!isAllowed) { - Logger.debug(`Link disallowed by robots.txt: ${link}`); + logger.debug(`Link disallowed by robots.txt: ${link}`); return false; } @@ -161,7 +158,7 @@ export class WebCrawler { } public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { - Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); + logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); @@ -170,115 +167,6 @@ export class WebCrawler { return null; } - public async start( - inProgress?: (progress: Progress) => void, - pageOptions?: PageOptions, - crawlerOptions?: CrawlerOptions, - concurrencyLimit: number = 5, - limit: number = 10000, - maxDepth: number = 10 - ): Promise<{ url: string, html: string }[]> { - - Logger.debug(`Crawler starting with ${this.initialUrl}`); - // Fetch and parse robots.txt - try { - const txt = await this.getRobotsTxt(); - this.importRobotsTxt(txt); - Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`); - } catch (error) { - Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); - } - - if (!crawlerOptions?.ignoreSitemap){ - const sm = await this.tryGetSitemap(); - if (sm !== null) { - return sm; - } - } - - const urls = await this.crawlUrls( - [this.initialUrl], - pageOptions, - concurrencyLimit, - inProgress - ); - - if ( - urls.length === 0 && - this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 - ) { - return [{ url: this.initialUrl, html: "" }]; - } - - // make sure to run include exclude here again - const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); - return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); - } - - private async crawlUrls( - urls: string[], - pageOptions: PageOptions, - concurrencyLimit: number, - inProgress?: (progress: Progress) => void, - ): Promise<{ url: string, html: string }[]> { - const queue = async.queue(async (task: string, callback) => { - Logger.debug(`Crawling ${task}`); - if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { - if (callback && typeof callback === "function") { - callback(); - } - return; - } - const newUrls = await this.crawl(task, pageOptions); - // add the initial url if not already added - // if (this.visited.size === 1) { - // let normalizedInitial = this.initialUrl; - // if (!normalizedInitial.endsWith("/")) { - // normalizedInitial = normalizedInitial + "/"; - // } - // if (!newUrls.some(page => page.url === this.initialUrl)) { - // newUrls.push({ url: this.initialUrl, html: "" }); - // } - // } - - newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); - - if (inProgress && newUrls.length > 0) { - inProgress({ - current: this.crawledUrls.size, - total: Math.min(this.maxCrawledLinks, this.limit), - status: "SCRAPING", - currentDocumentUrl: newUrls[newUrls.length - 1].url, - }); - } else if (inProgress) { - inProgress({ - current: this.crawledUrls.size, - total: Math.min(this.maxCrawledLinks, this.limit), - status: "SCRAPING", - currentDocumentUrl: task, - }); - } - await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress); - if (callback && typeof callback === "function") { - callback(); - } - }, concurrencyLimit); - - 
Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`); - queue.push( - urls.filter( - (url) => - !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent") - ), - (err) => { - if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`); - } - ); - await queue.drain(); - Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`); - return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); - } - public filterURL(href: string, url: string): string | null { let fullUrl = href; if (!href.startsWith("http")) { @@ -346,79 +234,9 @@ export class WebCrawler { return links; } - async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { - if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) { - return []; - } - this.visited.add(url); - - if (!url.startsWith("http")) { - url = "https://" + url; - } - if (url.endsWith("/")) { - url = url.slice(0, -1); - } - - if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { - return []; - } - - try { - let content: string = ""; - let pageStatusCode: number; - let pageError: string | undefined = undefined; - - // If it is the first link, fetch with single url - if (this.visited.size === 1) { - const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true }); - content = page.html ?? ""; - pageStatusCode = page.metadata?.pageStatusCode; - pageError = page.metadata?.pageError || undefined; - } else { - const response = await axios.get(url, { timeout: axiosTimeout }); - content = response.data ?? ""; - pageStatusCode = response.status; - pageError = response.statusText != "OK" ? response.statusText : undefined; - } - - const $ = load(content); - let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = []; - - // Add the initial URL to the list of links - if (this.visited.size === 1) { - links.push({ url, html: content, pageStatusCode, pageError }); - } - - links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError }))); - - if (this.visited.size === 1) { - return links; - } - - // Create a new list to return to avoid modifying the visited list - return links.filter((link) => !this.visited.has(link.url)); - } catch (error) { - return []; - } - } - private isRobotsAllowed(url: string): boolean { return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? 
true) : true) } - private normalizeCrawlUrl(url: string): string { - try{ - const urlObj = new URL(url); - urlObj.searchParams.sort(); // Sort query parameters to normalize - return urlObj.toString(); - } catch (error) { - return url; - } - } - - private matchesIncludes(url: string): boolean { - if (this.includes.length === 0 || this.includes[0] == "") return true; - return this.includes.some((pattern) => new RegExp(pattern).test(url)); - } private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { return this.excludes.some((pattern) => { @@ -503,7 +321,7 @@ export class WebCrawler { const urlWithoutQuery = url.split('?')[0].toLowerCase(); return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext)); } catch (error) { - Logger.error(`Error processing URL in isFile: ${error}`); + logger.error(`Error processing URL in isFile: ${error}`); return false; } } @@ -524,7 +342,6 @@ export class WebCrawler { return socialMediaOrEmail.some((ext) => url.includes(ext)); } - // private async tryFetchSitemapLinks(url: string): Promise { const normalizeUrl = (url: string) => { url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); @@ -546,7 +363,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); } } catch (error) { - Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); + logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { @@ -565,7 +382,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); } } catch (error) { - Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); if (error instanceof AxiosError && error.response?.status === 404) { // ignore 404 } else { diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index e5841978..48aa2ffd 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -1,4 +1,4 @@ -import { Logger } from "../../../lib/logger"; +import { logger } from "../../../lib/logger"; export async function handleCustomScraping( text: string, @@ -6,7 +6,7 @@ export async function handleCustomScraping( ): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> { // Check for Readme Docs special case if (text.includes(' { - throw new Error("Method not implemented."); - } - - private async convertUrlsToDocuments( - urls: string[], - inProgress?: (progress: Progress) => void, - allHtmls?: string[] - ): Promise { - const totalUrls = urls.length; - let processedUrls = 0; - - const results: (Document | null)[] = new Array(urls.length).fill(null); - for (let i = 0; i < urls.length; i += this.concurrentRequests) { - const batchUrls = urls.slice(i, i + this.concurrentRequests); - await Promise.all( - batchUrls.map(async (url, index) => { - const existingHTML = allHtmls ? 
allHtmls[i + index] : ""; - const result = await scrapSingleUrl( - this.jobId, - url, - this.pageOptions, - this.extractorOptions, - existingHTML, - this.priority, - this.teamId, - ); - processedUrls++; - if (inProgress) { - inProgress({ - current: processedUrls, - total: totalUrls, - status: "SCRAPING", - currentDocumentUrl: url, - currentDocument: { ...result, index: processedUrls }, - }); - } - - results[i + index] = result; - }) - ); - } - return results.filter((result) => result !== null) as Document[]; - } - - async getDocuments( - useCaching: boolean = false, - inProgress?: (progress: Progress) => void - ): Promise { - this.validateInitialUrl(); - if (!useCaching) { - return this.processDocumentsWithoutCache(inProgress); - } - - return this.processDocumentsWithCache(inProgress); - } - - private validateInitialUrl(): void { - if (this.urls[0].trim() === "") { - throw new Error("Url is required"); - } - } - - /** - * Process documents without cache handling each mode - * @param inProgress inProgress - * @returns documents - */ - private async processDocumentsWithoutCache( - inProgress?: (progress: Progress) => void - ): Promise { - switch (this.mode) { - case "crawl": - return this.handleCrawlMode(inProgress); - case "single_urls": - return this.handleSingleUrlsMode(inProgress); - case "sitemap": - return this.handleSitemapMode(inProgress); - default: - return []; - } - } - - private async cleanIrrelevantPath(links: string[]) { - return links.filter((link) => { - const normalizedInitialUrl = new URL(this.urls[0]); - const normalizedLink = new URL(link); - - // Normalize the hostname to account for www and non-www versions - const initialHostname = normalizedInitialUrl.hostname.replace( - /^www\./, - "" - ); - const linkHostname = normalizedLink.hostname.replace(/^www\./, ""); - - // Ensure the protocol and hostname match, and the path starts with the initial URL's path - return ( - linkHostname === initialHostname && - normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname) - ); - }); - } - - private async handleCrawlMode( - inProgress?: (progress: Progress) => void - ): Promise { - let includes: string[]; - if (Array.isArray(this.includes)) { - if (this.includes[0] != "") { - includes = this.includes; - } - } else { - includes = this.includes.split(','); - } - - let excludes: string[]; - if (Array.isArray(this.excludes)) { - if (this.excludes[0] != "") { - excludes = this.excludes; - } - } else { - excludes = this.excludes.split(','); - } - - const crawler = new WebCrawler({ - jobId: this.jobId, - initialUrl: this.urls[0], - includes, - excludes, - maxCrawledLinks: this.maxCrawledLinks, - maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth), - limit: this.limit, - generateImgAltText: this.generateImgAltText, - allowBackwardCrawling: this.allowBackwardCrawling, - allowExternalContentLinks: this.allowExternalContentLinks, - }); - - let links = await crawler.start( - inProgress, - this.pageOptions, - { - ignoreSitemap: this.ignoreSitemap, - }, - 5, - this.limit, - this.maxCrawledDepth - ); - - let allLinks = links.map((e) => e.url); - const allHtmls = links.map((e) => e.html); - - if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(allLinks, inProgress); - } - - let documents = []; - // check if fast mode is enabled and there is html inside the links - if (this.crawlerMode === "fast" && links.some((link) => link.html)) { - documents = await this.processLinks(allLinks, inProgress, allHtmls); - } else { - documents = await 
this.processLinks(allLinks, inProgress); - } - - return this.cacheAndFinalizeDocuments(documents, allLinks); - } - - private async handleSingleUrlsMode( - inProgress?: (progress: Progress) => void - ): Promise { - const links = this.urls; - - let documents = await this.processLinks(links, inProgress); - return documents; - } - - private async handleSitemapMode( - inProgress?: (progress: Progress) => void - ): Promise { - let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] }); - links = await this.cleanIrrelevantPath(links); - - if (this.returnOnlyUrls) { - return this.returnOnlyUrlsResponse(links, inProgress); - } - - let documents = await this.processLinks(links, inProgress); - return this.cacheAndFinalizeDocuments(documents, links); - } - - private async returnOnlyUrlsResponse( - links: string[], - inProgress?: (progress: Progress) => void - ): Promise { - inProgress?.({ - current: links.length, - total: links.length, - status: "COMPLETED", - currentDocumentUrl: this.urls[0], - }); - return links.map((url) => ({ - content: "", - html: this.pageOptions?.includeHtml ? "" : undefined, - markdown: "", - metadata: { sourceURL: url, pageStatusCode: 200 }, - })); - } - - private async processLinks( - links: string[], - inProgress?: (progress: Progress) => void, - allHtmls?: string[] - ): Promise { - const pdfLinks = links.filter((link) => link.endsWith(".pdf")); - const docLinks = links.filter( - (link) => link.endsWith(".doc") || link.endsWith(".docx") - ); - - const [pdfDocuments, docxDocuments] = await Promise.all([ - this.fetchPdfDocuments(pdfLinks), - this.fetchDocxDocuments(docLinks), - ]); - - links = links.filter( - (link) => !pdfLinks.includes(link) && !docLinks.includes(link) - ); - - let [documents, sitemapData] = await Promise.all([ - this.convertUrlsToDocuments(links, inProgress, allHtmls), - this.mode === "single_urls" && links.length > 0 - ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch( - (error) => { - Logger.debug(`Failed to fetch sitemap data: ${error}`); - return null; - } - ) - : Promise.resolve(null), - ]); - - if (this.mode === "single_urls" && documents.length > 0) { - documents[0].metadata.sitemap = sitemapData ?? undefined; - } else { - documents = await this.getSitemapData(this.urls[0], documents); - } - - if (this.pageOptions.includeMarkdown) { - documents = this.applyPathReplacements(documents); - } - - if (!this.pageOptions.includeHtml) { - for (let document of documents) { - delete document.html; - } - } - - // documents = await this.applyImgAltText(documents); - if (this.mode === "single_urls" && this.pageOptions.includeExtract) { - const extractionMode = this.extractorOptions?.mode ?? "markdown"; - const completionMode = extractionMode === "llm-extraction-from-raw-html" ? 
"raw-html" : "markdown"; - - if ( - extractionMode === "llm-extraction" || - extractionMode === "llm-extraction-from-markdown" || - extractionMode === "llm-extraction-from-raw-html" - ) { - documents = await generateCompletions( - documents, - this.extractorOptions, - completionMode - ); - } - } - return documents.concat(pdfDocuments).concat(docxDocuments); - } - - private async fetchPdfDocuments(pdfLinks: string[]): Promise { - return Promise.all( - pdfLinks.map(async (pdfLink) => { - const timer = Date.now(); - const logInsertPromise = ScrapeEvents.insert(this.jobId, { - type: "scrape", - url: pdfLink, - worker: process.env.FLY_MACHINE_ID, - method: "pdf-scrape", - result: null, - }); - - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( - pdfLink, - this.pageOptions.parsePDF - ); - - const insertedLogId = await logInsertPromise; - ScrapeEvents.updateScrapeResult(insertedLogId, { - response_size: content.length, - success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100), - error: pageError, - response_code: pageStatusCode, - time_taken: Date.now() - timer, - }); - return { - content: content, - markdown: content, - metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, - provider: "web-scraper", - }; - }) - ); - } - private async fetchDocxDocuments(docxLinks: string[]): Promise { - return Promise.all( - docxLinks.map(async (docxLink) => { - const timer = Date.now(); - const logInsertPromise = ScrapeEvents.insert(this.jobId, { - type: "scrape", - url: docxLink, - worker: process.env.FLY_MACHINE_ID, - method: "docx-scrape", - result: null, - }); - - const { content, pageStatusCode, pageError } = await fetchAndProcessDocx( - docxLink - ); - - const insertedLogId = await logInsertPromise; - ScrapeEvents.updateScrapeResult(insertedLogId, { - response_size: content.length, - success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100), - error: pageError, - response_code: pageStatusCode, - time_taken: Date.now() - timer, - }); - - return { - content, - metadata: { sourceURL: docxLink, pageStatusCode, pageError }, - provider: "web-scraper", - }; - }) - ); - } - - private applyPathReplacements(documents: Document[]): Document[] { - if (this.replaceAllPathsWithAbsolutePaths) { - documents = replacePathsWithAbsolutePaths(documents); - } - return replaceImgPathsWithAbsolutePaths(documents); - } - - private async applyImgAltText(documents: Document[]): Promise { - return this.generateImgAltText - ? 
this.generatesImgAltText(documents) - : documents; - } - - private async cacheAndFinalizeDocuments( - documents: Document[], - links: string[] - ): Promise { - // await this.setCachedDocuments(documents, links); - documents = this.removeChildLinks(documents); - return documents.splice(0, this.limit); - } - - private async processDocumentsWithCache( - inProgress?: (progress: Progress) => void - ): Promise { - let documents = await this.getCachedDocuments( - this.urls.slice(0, this.limit) - ); - if (documents.length < this.limit) { - const newDocuments: Document[] = await this.getDocuments( - false, - inProgress - ); - documents = this.mergeNewDocuments(documents, newDocuments); - } - documents = this.filterDocsExcludeInclude(documents); - documents = this.filterDepth(documents); - documents = this.removeChildLinks(documents); - return documents.splice(0, this.limit); - } - - private mergeNewDocuments( - existingDocuments: Document[], - newDocuments: Document[] - ): Document[] { - newDocuments.forEach((doc) => { - if ( - !existingDocuments.some( - (d) => - this.normalizeUrl(d.metadata.sourceURL) === - this.normalizeUrl(doc.metadata?.sourceURL) - ) - ) { - existingDocuments.push(doc); - } - }); - return existingDocuments; - } - - private filterDocsExcludeInclude(documents: Document[]): Document[] { - return documents.filter((document) => { - const url = new URL(document.metadata.sourceURL); - const path = url.pathname; - - if (!Array.isArray(this.excludes)) { - this.excludes = this.excludes.split(','); - } - - if (this.excludes.length > 0 && this.excludes[0] !== "") { - // Check if the link should be excluded - if ( - this.excludes.some((excludePattern) => - new RegExp(excludePattern).test(path) - ) - ) { - return false; - } - } - - if (!Array.isArray(this.includes)) { - this.includes = this.includes.split(','); - } - - if (this.includes.length > 0 && this.includes[0] !== "") { - // Check if the link matches the include patterns, if any are specified - if (this.includes.length > 0) { - return this.includes.some((includePattern) => - new RegExp(includePattern).test(path) - ); - } - } - return true; - }); - } - - private normalizeUrl(url: string): string { - if (url.includes("//www.")) { - return url.replace("//www.", "//"); - } - return url; - } - - private removeChildLinks(documents: Document[]): Document[] { - for (let document of documents) { - if (document?.childrenLinks) delete document.childrenLinks; - } - return documents; - } - - async setCachedDocuments(documents: Document[], childrenLinks?: string[]) { - for (const document of documents) { - if (document.content.trim().length === 0) { - continue; - } - const normalizedUrl = this.normalizeUrl(document.metadata.sourceURL); - await setValue( - "web-scraper-cache:" + normalizedUrl, - JSON.stringify({ - ...document, - childrenLinks: childrenLinks || [], - }), - 60 * 60 - ); // 10 days - } - } - - async getCachedDocuments(urls: string[]): Promise { - let documents: Document[] = []; - for (const url of urls) { - const normalizedUrl = this.normalizeUrl(url); - Logger.debug( - "Getting cached document for web-scraper-cache:" + normalizedUrl - ); - const cachedDocumentString = await getValue( - "web-scraper-cache:" + normalizedUrl - ); - if (cachedDocumentString) { - const cachedDocument = JSON.parse(cachedDocumentString); - documents.push(cachedDocument); - - // get children documents - for (const childUrl of cachedDocument.childrenLinks || []) { - const normalizedChildUrl = this.normalizeUrl(childUrl); - const childCachedDocumentString = 
await getValue( - "web-scraper-cache:" + normalizedChildUrl - ); - if (childCachedDocumentString) { - const childCachedDocument = JSON.parse(childCachedDocumentString); - if ( - !documents.find( - (doc) => - doc.metadata.sourceURL === - childCachedDocument.metadata.sourceURL - ) - ) { - documents.push(childCachedDocument); - } - } - } - } - } - return documents; - } - - setOptions(options: WebScraperOptions): void { - if (!options.urls) { - throw new Error("Urls are required"); - } - - this.jobId = options.jobId; - this.bullJobId = options.bullJobId; - this.urls = options.urls; - this.mode = options.mode; - this.concurrentRequests = options.concurrentRequests ?? 20; - this.includes = options.crawlerOptions?.includes ?? []; - this.excludes = options.crawlerOptions?.excludes ?? []; - this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; - this.maxCrawledDepth = options.crawlerOptions?.maxDepth ?? 10; - this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; - this.limit = options.crawlerOptions?.limit ?? 10000; - this.generateImgAltText = - options.crawlerOptions?.generateImgAltText ?? false; - this.pageOptions = { - onlyMainContent: options.pageOptions?.onlyMainContent ?? false, - includeHtml: options.pageOptions?.includeHtml ?? false, - replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true, - parsePDF: options.pageOptions?.parsePDF ?? true, - onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [], - removeTags: options.pageOptions?.removeTags ?? [], - includeMarkdown: options.pageOptions?.includeMarkdown ?? true, - includeRawHtml: options.pageOptions?.includeRawHtml ?? false, - includeExtract: options.pageOptions?.includeExtract ?? (options.extractorOptions?.mode && options.extractorOptions?.mode !== "markdown") ?? false, - waitFor: options.pageOptions?.waitFor ?? undefined, - headers: options.pageOptions?.headers ?? undefined, - includeLinks: options.pageOptions?.includeLinks ?? true, - fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false, - screenshot: options.pageOptions?.screenshot ?? false, - useFastMode: options.pageOptions?.useFastMode ?? false, - disableJsDom: options.pageOptions?.disableJsDom ?? false, - atsv: options.pageOptions?.atsv ?? false, - actions: options.pageOptions?.actions ?? undefined, - geolocation: options.pageOptions?.geolocation ?? undefined, - skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false, - removeBase64Images: options.pageOptions?.removeBase64Images ?? true, - mobile: options.pageOptions?.mobile ?? false, - }; - this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; - this.replaceAllPathsWithAbsolutePaths = - options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? - options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? - false; - - if (typeof options.crawlerOptions?.excludes === 'string') { - this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== ""); - } - - if (typeof options.crawlerOptions?.includes === 'string') { - this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== ""); - } - - this.crawlerMode = options.crawlerOptions?.mode ?? "default"; - this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false; - this.allowBackwardCrawling = - options.crawlerOptions?.allowBackwardCrawling ?? false; - this.allowExternalContentLinks = - options.crawlerOptions?.allowExternalContentLinks ?? 
false; - this.priority = options.priority; - this.teamId = options.teamId ?? null; - - - - // make sure all urls start with https:// - this.urls = this.urls.map((url) => { - if (!url.trim().startsWith("http")) { - return `https://${url}`; - } - return url; - }); - } - - private async getSitemapData(baseUrl: string, documents: Document[]) { - const sitemapData = await fetchSitemapData(baseUrl); - if (sitemapData) { - for (let i = 0; i < documents.length; i++) { - const docInSitemapData = sitemapData.find( - (data) => - this.normalizeUrl(data.loc) === - this.normalizeUrl(documents[i].metadata.sourceURL) - ); - if (docInSitemapData) { - let sitemapDocData: Partial = {}; - if (docInSitemapData.changefreq) { - sitemapDocData.changefreq = docInSitemapData.changefreq; - } - if (docInSitemapData.priority) { - sitemapDocData.priority = Number(docInSitemapData.priority); - } - if (docInSitemapData.lastmod) { - sitemapDocData.lastmod = docInSitemapData.lastmod; - } - if (Object.keys(sitemapDocData).length !== 0) { - documents[i].metadata.sitemap = sitemapDocData; - } - } - } - } - return documents; - } - private async getSitemapDataForSingleUrl( - baseUrl: string, - url: string, - timeout?: number - ) { - const sitemapData = await fetchSitemapData(baseUrl, timeout); - if (sitemapData) { - const docInSitemapData = sitemapData.find( - (data) => this.normalizeUrl(data.loc) === this.normalizeUrl(url) - ); - if (docInSitemapData) { - let sitemapDocData: Partial = {}; - if (docInSitemapData.changefreq) { - sitemapDocData.changefreq = docInSitemapData.changefreq; - } - if (docInSitemapData.priority) { - sitemapDocData.priority = Number(docInSitemapData.priority); - } - if (docInSitemapData.lastmod) { - sitemapDocData.lastmod = docInSitemapData.lastmod; - } - if (Object.keys(sitemapDocData).length !== 0) { - return sitemapDocData; - } - } - } - return null; - } - generatesImgAltText = async (documents: Document[]): Promise => { - await Promise.all( - documents.map(async (document) => { - const images = document.content.match(/!\[.*?\]\((.*?)\)/g) || []; - - await Promise.all( - images.map(async (image: string) => { - let imageUrl = image.match(/\(([^)]+)\)/)[1]; - let altText = image.match(/\[(.*?)\]/)[1]; - - if ( - !altText && - !imageUrl.startsWith("data:image") && - /\.(png|jpeg|gif|webp)$/.test(imageUrl) - ) { - const imageIndex = document.content.indexOf(image); - const contentLength = document.content.length; - let backText = document.content.substring( - imageIndex + image.length, - Math.min(imageIndex + image.length + 1000, contentLength) - ); - let frontTextStartIndex = Math.max(imageIndex - 1000, 0); - let frontText = document.content.substring( - frontTextStartIndex, - imageIndex - ); - altText = await getImageDescription( - imageUrl, - backText, - frontText, - this.generateImgAltTextModel - ); - } - - document.content = document.content.replace( - image, - `![${altText}](${imageUrl})` - ); - }) - ); - }) - ); - - return documents; - }; - - filterDepth(documents: Document[]): Document[] { - return documents.filter((document) => { - const url = new URL(document.metadata.sourceURL); - return getURLDepth(url.toString()) <= this.maxCrawledDepth; - }); - } -} diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts deleted file mode 100644 index 0df3be72..00000000 --- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts +++ /dev/null @@ -1,89 +0,0 @@ -import axios from "axios"; -import { logScrape } from "../../../services/logging/scrape_log"; 
-import { fetchAndProcessPdf } from "../utils/pdfProcessor"; -import { universalTimeout } from "../global"; -import { Logger } from "../../../lib/logger"; - -/** - * Scrapes a URL with Axios - * @param url The URL to scrape - * @param pageOptions The options for the page - * @returns The scraped content - */ -export async function scrapWithFetch( - url: string, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: "fetch", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: null, - html: "", - startTime: Date.now(), - }; - - try { - const response = await axios.get(url, { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout, - transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically - }); - - if (response.status !== 200) { - Logger.debug( - `⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}` - ); - logParams.error_message = response.statusText; - logParams.response_code = response.status; - return { - content: "", - pageStatusCode: response.status, - pageError: response.statusText, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( - url, - pageOptions?.parsePDF - ); - logParams.response_code = pageStatusCode; - logParams.error_message = pageError; - return { content, pageStatusCode: response.status, pageError }; - } else { - const text = response.data; - logParams.success = true; - logParams.html = text; - logParams.response_code = response.status; - return { - content: text, - pageStatusCode: response.status, - pageError: null, - }; - } - } catch (error) { - if (error.code === "ECONNABORTED") { - logParams.error_message = "Request timed out"; - Logger.debug(`⛏️ Axios: Request timed out for ${url}`); - } else { - logParams.error_message = error.message || error; - Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`); - } - return { - content: "", - pageStatusCode: error.response?.status ?? 
null, - pageError: logParams.error_message, - }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts deleted file mode 100644 index 7616774a..00000000 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ /dev/null @@ -1,230 +0,0 @@ -import axios from "axios"; -import { Action, FireEngineOptions, FireEngineResponse } from "../../../lib/entities"; -import { logScrape } from "../../../services/logging/scrape_log"; -import { generateRequestParams } from "../single_url"; -import { fetchAndProcessPdf } from "../utils/pdfProcessor"; -import { universalTimeout } from "../global"; -import { Logger } from "../../../lib/logger"; -import * as Sentry from "@sentry/node"; -import axiosRetry from 'axios-retry'; - -axiosRetry(axios, { retries: 3 , onRetry:()=>{ - console.log("Retrying (fire-engine)..."); -}, retryDelay: axiosRetry.exponentialDelay}); -/** - * Scrapes a URL with Fire-Engine - * @param url The URL to scrape - * @param waitFor The time to wait for the page to load - * @param screenshot Whether to take a screenshot - * @param fullPageScreenshot Whether to take a full page screenshot - * @param pageOptions The options for the page - * @param headers The headers to send with the request - * @param options The options for the request - * @returns The scraped content - */ -export async function scrapWithFireEngine({ - url, - actions, - waitFor = 0, - screenshot = false, - fullPageScreenshot = false, - pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true, mobile: false }, - fireEngineOptions = {}, - headers, - options, - priority, - teamId, -}: { - url: string; - actions?: Action[]; - waitFor?: number; - screenshot?: boolean; - fullPageScreenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean, mobile?: boolean }; - fireEngineOptions?: FireEngineOptions; - headers?: Record; - options?: any; - priority?: number; - teamId?: string; -}): Promise { - const logParams = { - url, - scraper: "fire-engine", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: null, - html: "", - startTime: Date.now(), - }; - - try { - const reqParams = await generateRequestParams(url); - let waitParam = reqParams["params"]?.wait ?? waitFor; - let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp"; - let screenshotParam = reqParams["params"]?.screenshot ?? screenshot; - let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot; - let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; - - - let endpoint = "/scrape"; - - if(options?.endpoint === "request") { - endpoint = "/request"; - } - - let engine = engineParam; // do we want fireEngineOptions as first choice? - - if (pageOptions?.useFastMode) { - fireEngineOptionsParam.engine = "tlsclient"; - engine = "tlsclient"; - } - - Logger.info( - `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { actions: ${JSON.stringify((actions ?? 
[]).map(x => x.type))}, method: ${fireEngineOptionsParam?.method ?? "null"} }` - ); - - // atsv is only available for beta customers - const betaCustomersString = process.env.BETA_CUSTOMERS; - const betaCustomers = betaCustomersString ? betaCustomersString.split(",") : []; - - if (pageOptions?.atsv && betaCustomers.includes(teamId)) { - fireEngineOptionsParam.atsv = true; - } else { - pageOptions.atsv = false; - } - - const axiosInstance = axios.create({ - headers: { "Content-Type": "application/json" } - }); - - const startTime = Date.now(); - const _response = await Sentry.startSpan({ - name: "Call to fire-engine" - }, async span => { - - return await axiosInstance.post( - process.env.FIRE_ENGINE_BETA_URL + endpoint, - { - url: url, - headers: headers, - wait: waitParam, - screenshot: screenshotParam, - fullPageScreenshot: fullPageScreenshotParam, - disableJsDom: pageOptions?.disableJsDom ?? false, - priority, - engine, - instantReturn: true, - mobile: pageOptions?.mobile ?? false, - ...fireEngineOptionsParam, - atsv: pageOptions?.atsv ?? false, - scrollXPaths: pageOptions?.scrollXPaths ?? [], - geolocation: pageOptions?.geolocation, - skipTlsVerification: pageOptions?.skipTlsVerification ?? false, - removeBase64Images: pageOptions?.removeBase64Images ?? true, - actions: actions, - }, - { - headers: { - "Content-Type": "application/json", - ...(Sentry.isInitialized() ? ({ - "sentry-trace": Sentry.spanToTraceHeader(span), - "baggage": Sentry.spanToBaggageHeader(span), - }) : {}), - } - } - ); - }); - - const waitTotal = (actions ?? []).filter(x => x.type === "wait").reduce((a, x) => (x as { type: "wait"; milliseconds: number; }).milliseconds + a, 0); - - let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); - - // added 5 seconds to the timeout to account for 'smart wait' - while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitTotal + 5000) { - await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds - checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`); - } - - if (checkStatusResponse.data.processing) { - Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`); - axiosInstance.delete( - process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`, { - validateStatus: (status) => true - } - ).catch((error) => { - Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`); - }); - - Logger.debug(`⛏️ Fire-Engine (${engine}): Request timed out for ${url}`); - logParams.error_message = "Request timed out"; - return { html: "", pageStatusCode: null, pageError: "" }; - } - - if (checkStatusResponse.status !== 200 || checkStatusResponse.data.error) { - Logger.debug( - `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.status}\t ${checkStatusResponse.data.error}` - ); - - logParams.error_message = checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error; - logParams.response_code = checkStatusResponse.data?.pageStatusCode; - - if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) { - Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`); - } - - const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? 
checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined; - - return { - html: "", - pageStatusCode, - pageError: checkStatusResponse.data?.pageError ?? checkStatusResponse.data?.error, - }; - } - - const contentType = checkStatusResponse.data.responseHeaders?.["content-type"]; - - if (contentType && contentType.includes("application/pdf")) { - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( - url, - pageOptions?.parsePDF - ); - logParams.success = true; - logParams.response_code = pageStatusCode; - logParams.error_message = pageError; - return { html: content, pageStatusCode, pageError }; - } else { - const data = checkStatusResponse.data; - - logParams.success = - (data.pageStatusCode >= 200 && data.pageStatusCode < 300) || - data.pageStatusCode === 404; - logParams.html = data.content ?? ""; - logParams.response_code = data.pageStatusCode; - logParams.error_message = data.pageError ?? data.error; - - return { - html: data.content ?? "", - screenshots: data.screenshots ?? [data.screenshot] ?? [], - pageStatusCode: data.pageStatusCode, - pageError: data.pageError ?? data.error, - scrapeActionContent: data?.actionContent ?? [], - }; - } - } catch (error) { - if (error.code === "ECONNABORTED") { - Logger.debug(`⛏️ Fire-Engine (catch block): Request timed out for ${url}`); - logParams.error_message = "Request timed out"; - } else { - Logger.debug(`⛏️ Fire-Engine(catch block): Failed to fetch url: ${url} | Error: ${error}`); - logParams.error_message = error.message || error; - } - return { html: "", pageStatusCode: null, pageError: logParams.error_message }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams, pageOptions); - } -} - - diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts deleted file mode 100644 index 09c7353b..00000000 --- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts +++ /dev/null @@ -1,111 +0,0 @@ -import axios from "axios"; -import { logScrape } from "../../../services/logging/scrape_log"; -import { generateRequestParams } from "../single_url"; -import { fetchAndProcessPdf } from "../utils/pdfProcessor"; -import { universalTimeout } from "../global"; -import { Logger } from "../../../lib/logger"; - -/** - * Scrapes a URL with Playwright - * @param url The URL to scrape - * @param waitFor The time to wait for the page to load - * @param headers The headers to send with the request - * @param pageOptions The options for the page - * @returns The scraped content - */ -export async function scrapWithPlaywright( - url: string, - waitFor: number = 0, - headers?: Record, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } -): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: "playwright", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: null, - html: "", - startTime: Date.now(), - }; - - try { - const reqParams = await generateRequestParams(url); - // If the user has passed a wait parameter in the request, use that - const waitParam = reqParams["params"]?.wait ?? 
waitFor; - - const response = await axios.post( - process.env.PLAYWRIGHT_MICROSERVICE_URL, - { - url: url, - wait_after_load: waitParam, - timeout: universalTimeout + waitParam, - headers: headers, - }, - { - headers: { - "Content-Type": "application/json", - }, - timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time - transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically - } - ); - - if (response.status !== 200) { - Logger.debug( - `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}` - ); - logParams.error_message = response.data?.pageError; - logParams.response_code = response.data?.pageStatusCode; - return { - content: "", - pageStatusCode: response.data?.pageStatusCode, - pageError: response.data?.pageError, - }; - } - - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); - logParams.response_code = pageStatusCode; - logParams.error_message = pageError; - return { content, pageStatusCode, pageError }; - } else { - const textData = response.data; - try { - const data = JSON.parse(textData); - const html = data.content; - logParams.success = true; - logParams.html = html; - logParams.response_code = data.pageStatusCode; - logParams.error_message = data.pageError; - return { - content: html ?? "", - pageStatusCode: data.pageStatusCode, - pageError: data.pageError, - }; - } catch (jsonError) { - logParams.error_message = jsonError.message || jsonError; - Logger.debug( - `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}` - ); - return { content: "", pageStatusCode: null, pageError: logParams.error_message }; - } - } - } catch (error) { - if (error.code === "ECONNABORTED") { - logParams.error_message = "Request timed out"; - Logger.debug(`⛏️ Playwright: Request timed out for ${url}`); - } else { - logParams.error_message = error.message || error; - Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`); - } - return { content: "", pageStatusCode: null, pageError: logParams.error_message }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } -} diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts deleted file mode 100644 index b72fa8b2..00000000 --- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts +++ /dev/null @@ -1,92 +0,0 @@ -import { logScrape } from "../../../services/logging/scrape_log"; -import { generateRequestParams } from "../single_url"; -import { fetchAndProcessPdf } from "../utils/pdfProcessor"; -import { universalTimeout } from "../global"; -import { ScrapingBeeClient } from "scrapingbee"; -import { Logger } from "../../../lib/logger"; - -/** - * Scrapes a URL with ScrapingBee - * @param url The URL to scrape - * @param wait_browser The browser event to wait for - * @param timeout The timeout for the scrape - * @param pageOptions The options for the page - * @returns The scraped content - */ -export async function scrapWithScrapingBee( - url: string, - wait_browser: string = "domcontentloaded", - timeout: number = universalTimeout, - pageOptions: { parsePDF?: boolean } = { parsePDF: true } - ): Promise<{ content: string; 
pageStatusCode?: number; pageError?: string }> { - const logParams = { - url, - scraper: wait_browser === "networkidle2" ? "scrapingBeeLoad" : "scrapingBee", - success: false, - response_code: null, - time_taken_seconds: null, - error_message: null, - html: "", - startTime: Date.now(), - }; - try { - const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); - const clientParams = await generateRequestParams( - url, - wait_browser, - timeout - ); - const response = await client.get({ - ...clientParams, - params: { - ...clientParams.params, - transparent_status_code: "True", - }, - }); - Logger.info( - `⛏️ ScrapingBee: Scraping ${url}` - ); - const contentType = response.headers["content-type"]; - if (contentType && contentType.includes("application/pdf")) { - logParams.success = true; - const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF); - logParams.response_code = pageStatusCode; - logParams.error_message = pageError; - return { content, pageStatusCode, pageError }; - } else { - let text = ""; - try { - const decoder = new TextDecoder(); - text = decoder.decode(response.data); - logParams.success = true; - } catch (decodeError) { - Logger.debug( - `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}` - ); - logParams.error_message = decodeError.message || decodeError; - } - logParams.response_code = response.status; - logParams.html = text; - logParams.success = response.status >= 200 && response.status < 300 || response.status === 404; - logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined; - return { - content: text, - pageStatusCode: response.status, - pageError: response.statusText !== "OK" ? response.statusText : undefined, - }; - } - } catch (error) { - Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`); - logParams.error_message = error.message || error; - logParams.response_code = error.response?.status; - return { - content: "", - pageStatusCode: error.response?.status, - pageError: error.response?.statusText, - }; - } finally { - const endTime = Date.now(); - logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; - await logScrape(logParams); - } - } \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts deleted file mode 100644 index 38ec74f0..00000000 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ /dev/null @@ -1,506 +0,0 @@ -import * as cheerio from "cheerio"; -import { extractMetadata } from "./utils/metadata"; -import dotenv from "dotenv"; -import { - Document, - PageOptions, - FireEngineResponse, - ExtractorOptions, - Action, -} from "../../lib/entities"; -import { parseMarkdown } from "../../lib/html-to-markdown"; -import { urlSpecificParams } from "./utils/custom/website_params"; -import { fetchAndProcessPdf } from "./utils/pdfProcessor"; -import { handleCustomScraping } from "./custom/handleCustomScraping"; -import { removeUnwantedElements } from "./utils/removeUnwantedElements"; -import { scrapWithFetch } from "./scrapers/fetch"; -import { scrapWithFireEngine } from "./scrapers/fireEngine"; -import { scrapWithPlaywright } from "./scrapers/playwright"; -import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; -import { extractLinks } from "./utils/utils"; -import { Logger } from "../../lib/logger"; -import { ScrapeEvents } from "../../lib/scrape-events"; -import { clientSideError } from "../../strings"; 
-import { ScrapeActionContent } from "../../lib/entities"; -import { removeBase64Images } from "./utils/removeBase64Images"; - -dotenv.config(); - -const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined; -const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined; - -export const baseScrapers = [ - useFireEngine ? "fire-engine;chrome-cdp" : undefined, - useFireEngine ? "fire-engine" : undefined, - useScrapingBee ? "scrapingBee" : undefined, - useFireEngine ? undefined : "playwright", - useScrapingBee ? "scrapingBeeLoad" : undefined, - "fetch", -].filter(Boolean); - -export async function generateRequestParams( - url: string, - wait_browser: string = "domcontentloaded", - timeout: number = 15000 -): Promise { - const defaultParams = { - url: url, - params: { timeout: timeout, wait_browser: wait_browser }, - headers: { "ScrapingService-Request": "TRUE" }, - }; - - try { - const urlKey = new URL(url).hostname.replace(/^www\./, ""); - if (urlSpecificParams.hasOwnProperty(urlKey)) { - return { ...defaultParams, ...urlSpecificParams[urlKey] }; - } else { - return defaultParams; - } - } catch (error) { - Logger.error(`Error generating URL key: ${error}`); - return defaultParams; - } -} - -/** - * Get the order of scrapers to be used for scraping a URL - * If the user doesn't have envs set for a specific scraper, it will be removed from the order. - * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined - * @returns The order of scrapers to be used for scraping a URL - */ -function getScrapingFallbackOrder( - defaultScraper?: string, - isWaitPresent: boolean = false, - isScreenshotPresent: boolean = false, - isHeadersPresent: boolean = false, - isActionsPresent: boolean = false, -) { - if (isActionsPresent) { - return useFireEngine ? ["fire-engine;chrome-cdp"] : []; - } - - const availableScrapers = baseScrapers.filter((scraper) => { - switch (scraper) { - case "scrapingBee": - case "scrapingBeeLoad": - return !!process.env.SCRAPING_BEE_API_KEY; - case "fire-engine": - return !!process.env.FIRE_ENGINE_BETA_URL; - case "fire-engine;chrome-cdp": - return !!process.env.FIRE_ENGINE_BETA_URL; - case "playwright": - return !!process.env.PLAYWRIGHT_MICROSERVICE_URL; - default: - return true; - } - }); - - let defaultOrder = [ - useFireEngine ? "fire-engine;chrome-cdp" : undefined, - useFireEngine ? "fire-engine" : undefined, - useScrapingBee ? "scrapingBee" : undefined, - useScrapingBee ? "scrapingBeeLoad" : undefined, - useFireEngine ? undefined : "playwright", - "fetch", - ].filter(Boolean); - - // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { - // defaultOrder = [ - // "fire-engine", - // useFireEngine ? undefined : "playwright", - // ...defaultOrder.filter( - // (scraper) => scraper !== "fire-engine" && scraper !== "playwright" - // ), - // ].filter(Boolean); - // } - - const filteredDefaultOrder = defaultOrder.filter( - (scraper: (typeof baseScrapers)[number]) => - availableScrapers.includes(scraper) - ); - const uniqueScrapers = new Set( - defaultScraper - ? 
[defaultScraper, ...filteredDefaultOrder, ...availableScrapers] - : [...filteredDefaultOrder, ...availableScrapers] - ); - - const scrapersInOrder = Array.from(uniqueScrapers); - return scrapersInOrder as (typeof baseScrapers)[number][]; -} - - - -export async function scrapSingleUrl( - jobId: string, - urlToScrap: string, - pageOptions: PageOptions, - extractorOptions?: ExtractorOptions, - existingHtml?: string, - priority?: number, - teamId?: string -): Promise { - pageOptions = { - includeMarkdown: pageOptions.includeMarkdown ?? true, - includeExtract: pageOptions.includeExtract ?? false, - onlyMainContent: pageOptions.onlyMainContent ?? false, - includeHtml: pageOptions.includeHtml ?? false, - includeRawHtml: pageOptions.includeRawHtml ?? false, - waitFor: pageOptions.waitFor ?? undefined, - screenshot: pageOptions.screenshot ?? false, - fullPageScreenshot: pageOptions.fullPageScreenshot ?? false, - headers: pageOptions.headers ?? undefined, - includeLinks: pageOptions.includeLinks ?? true, - replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true, - parsePDF: pageOptions.parsePDF ?? true, - removeTags: pageOptions.removeTags ?? [], - onlyIncludeTags: pageOptions.onlyIncludeTags ?? [], - useFastMode: pageOptions.useFastMode ?? false, - disableJsDom: pageOptions.disableJsDom ?? false, - atsv: pageOptions.atsv ?? false, - actions: pageOptions.actions ?? undefined, - geolocation: pageOptions.geolocation ?? undefined, - skipTlsVerification: pageOptions.skipTlsVerification ?? false, - removeBase64Images: pageOptions.removeBase64Images ?? true, - mobile: pageOptions.mobile ?? false, - } - - if (extractorOptions) { - extractorOptions = { - mode: extractorOptions?.mode ?? "llm-extraction-from-markdown", - } - } - - if (!existingHtml) { - existingHtml = ""; - } - - urlToScrap = urlToScrap.trim(); - - const attemptScraping = async ( - url: string, - method: (typeof baseScrapers)[number] - ) => { - let scraperResponse: { - text: string; - screenshot: string; - actions?: { - screenshots?: string[]; - scrapes?: ScrapeActionContent[]; - }; - metadata: { pageStatusCode?: number; pageError?: string | null }; - } = { text: "", screenshot: "", metadata: {} }; - let screenshot = ""; - - const timer = Date.now(); - const logInsertPromise = ScrapeEvents.insert(jobId, { - type: "scrape", - url, - worker: process.env.FLY_MACHINE_ID, - method, - result: null, - }); - - switch (method) { - case "fire-engine": - case "fire-engine;chrome-cdp": - - let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright"; - if (method === "fire-engine;chrome-cdp") { - engine = "chrome-cdp"; - } - - if (process.env.FIRE_ENGINE_BETA_URL) { - const processedActions: Action[] = pageOptions.actions?.flatMap((action: Action, index: number, array: Action[]) => { - if (action.type === "click" || action.type === "write" || action.type === "press") { - const result: Action[] = []; - // Don't add a wait if the previous action is a wait - // if (index === 0 || array[index - 1].type !== "wait") { - // result.push({ type: "wait", milliseconds: 1200 } as Action); - // } - // Fire-engine now handles wait times automatically, leaving the code here for now - result.push(action); - // Don't add a wait if the next action is a wait - // if (index === array.length - 1 || array[index + 1].type !== "wait") { - // result.push({ type: "wait", milliseconds: 1200 } as Action); - // } - return result; - } - return [action as Action]; - }) ?? 
[] as Action[]; - - const response = await scrapWithFireEngine({ - url, - ...(engine === "chrome-cdp" ? ({ - actions: [ - ...(pageOptions.waitFor ? [{ - type: "wait" as const, - milliseconds: pageOptions.waitFor, - }] : []), - ...((pageOptions.screenshot || pageOptions.fullPageScreenshot) ? [{ - type: "screenshot" as const, - fullPage: !!pageOptions.fullPageScreenshot, - }] : []), - ...processedActions, - ], - }) : ({ - waitFor: pageOptions.waitFor, - screenshot: pageOptions.screenshot, - fullPageScreenshot: pageOptions.fullPageScreenshot, - })), - pageOptions: pageOptions, - headers: pageOptions.headers, - fireEngineOptions: { - engine: engine, - atsv: pageOptions.atsv, - disableJsDom: pageOptions.disableJsDom, - }, - priority, - teamId, - }); - scraperResponse.text = response.html; - if (pageOptions.screenshot || pageOptions.fullPageScreenshot) { - scraperResponse.screenshot = (response.screenshots ?? []).splice(0, 1)[0] ?? ""; - } - if (pageOptions.actions) { - scraperResponse.actions = { - screenshots: response.screenshots ?? [], - scrapes: response.scrapeActionContent ?? [], - }; - } - scraperResponse.metadata.pageStatusCode = response.pageStatusCode; - scraperResponse.metadata.pageError = response.pageError; - } - break; - case "scrapingBee": - if (process.env.SCRAPING_BEE_API_KEY) { - const response = await scrapWithScrapingBee( - url, - "domcontentloaded", - pageOptions.fallback === false ? 7000 : 15000 - ); - scraperResponse.text = response.content; - scraperResponse.metadata.pageStatusCode = response.pageStatusCode; - scraperResponse.metadata.pageError = response.pageError; - } - break; - case "playwright": - if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { - const response = await scrapWithPlaywright( - url, - pageOptions.waitFor, - pageOptions.headers - ); - scraperResponse.text = response.content; - scraperResponse.metadata.pageStatusCode = response.pageStatusCode; - scraperResponse.metadata.pageError = response.pageError; - } - break; - case "scrapingBeeLoad": - if (process.env.SCRAPING_BEE_API_KEY) { - const response = await scrapWithScrapingBee(url, "networkidle2"); - scraperResponse.text = response.content; - scraperResponse.metadata.pageStatusCode = response.pageStatusCode; - scraperResponse.metadata.pageError = response.pageError; - } - break; - case "fetch": - const response = await scrapWithFetch(url); - scraperResponse.text = response.content; - scraperResponse.metadata.pageStatusCode = response.pageStatusCode; - scraperResponse.metadata.pageError = response.pageError; - break; - } - - let customScrapedContent: FireEngineResponse | null = null; - - // Check for custom scraping conditions - const customScraperResult = await handleCustomScraping( - scraperResponse.text, - url - ); - - if (customScraperResult) { - switch (customScraperResult.scraper) { - case "fire-engine": - customScrapedContent = await scrapWithFireEngine({ - url: customScraperResult.url, - actions: customScraperResult.waitAfterLoad ? 
([ - { - type: "wait", - milliseconds: customScraperResult.waitAfterLoad, - } - ]) : ([]), - pageOptions: customScraperResult.pageOptions, - }); - break; - case "pdf": - const { content, pageStatusCode, pageError } = - await fetchAndProcessPdf( - customScraperResult.url, - pageOptions?.parsePDF - ); - customScrapedContent = { - html: content, - pageStatusCode, - pageError, - }; - break; - } - } - - if (customScrapedContent) { - scraperResponse.text = customScrapedContent.html; - } - //* TODO: add an optional to return markdown or structured/extracted content - let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); - let text = await parseMarkdown(cleanedHtml); - if (pageOptions.removeBase64Images) { - text = await removeBase64Images(text); - } - - const insertedLogId = await logInsertPromise; - ScrapeEvents.updateScrapeResult(insertedLogId, { - response_size: scraperResponse.text.length, - success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100), - error: scraperResponse.metadata.pageError, - response_code: scraperResponse.metadata.pageStatusCode, - time_taken: Date.now() - timer, - }); - - return { - text, - html: cleanedHtml, - rawHtml: scraperResponse.text, - screenshot: scraperResponse.screenshot, - actions: scraperResponse.actions, - pageStatusCode: scraperResponse.metadata.pageStatusCode, - pageError: scraperResponse.metadata.pageError || undefined, - }; - }; - - let { text, html, rawHtml, screenshot, actions, pageStatusCode, pageError } = { - text: "", - html: "", - rawHtml: "", - screenshot: "", - actions: undefined, - pageStatusCode: 200, - pageError: undefined, - }; - try { - let urlKey = urlToScrap; - try { - urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); - } catch (error) { - Logger.error(`Invalid URL key, trying: ${urlToScrap}`); - } - const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? ""; - const scrapersInOrder = getScrapingFallbackOrder( - defaultScraper, - pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, - pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true), - pageOptions && pageOptions.headers && pageOptions.headers !== undefined, - pageOptions && Array.isArray(pageOptions.actions) && pageOptions.actions.length > 0, - ); - - for (const scraper of scrapersInOrder) { - // If exists text coming from crawler, use it - if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) { - let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); - text = await parseMarkdown(cleanedHtml); - html = cleanedHtml; - break; - } - - const attempt = await attemptScraping(urlToScrap, scraper); - text = attempt.text ?? ""; - html = attempt.html ?? ""; - rawHtml = attempt.rawHtml ?? ""; - screenshot = attempt.screenshot ?? ""; - actions = attempt.actions ?? 
undefined; - - if (attempt.pageStatusCode) { - pageStatusCode = attempt.pageStatusCode; - } - - if (attempt.pageError && (attempt.pageStatusCode >= 400 || scrapersInOrder.indexOf(scraper) === scrapersInOrder.length - 1)) { // force pageError if it's the last scraper and it failed too - pageError = attempt.pageError; - - if (attempt.pageStatusCode < 400 || !attempt.pageStatusCode) { - pageStatusCode = 500; - } - } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) { - pageError = undefined; - } - - if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) { - Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`); - break; - } - if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 400)) { - Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`); - break; - } - // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; - // if (nextScraperIndex < scrapersInOrder.length) { - // Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`); - // } - } - - if (!text) { - throw new Error(`All scraping methods failed for URL: ${urlToScrap}`); - } - - const soup = cheerio.load(rawHtml); - const metadata = extractMetadata(soup, urlToScrap); - - let linksOnPage: string[] | undefined; - - if (pageOptions.includeLinks) { - linksOnPage = extractLinks(rawHtml, urlToScrap); - } - - let document: Document = { - content: text, - markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined, - html: pageOptions.includeHtml ? html : undefined, - rawHtml: - pageOptions.includeRawHtml || - (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract) - ? rawHtml - : undefined, - linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined, - actions, - metadata: { - ...metadata, - ...(screenshot && screenshot.length > 0 ? ({ - screenshot, - }) : {}), - sourceURL: urlToScrap, - pageStatusCode: pageStatusCode, - pageError: pageError, - }, - }; - - return document; - } catch (error) { - Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); - ScrapeEvents.insert(jobId, { - type: "error", - message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error), - stack: error.stack, - }); - - return { - content: "", - markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined, - html: "", - linksOnPage: pageOptions.includeLinks ? 
[] : undefined, - metadata: { - sourceURL: urlToScrap, - pageStatusCode: pageStatusCode, - pageError: pageError, - }, - } as Document; - } -} diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 756cd765..05b3d00d 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -1,9 +1,10 @@ import axios from "axios"; import { axiosTimeout } from "../../lib/timeout"; import { parseStringPromise } from "xml2js"; -import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { WebCrawler } from "./crawler"; -import { Logger } from "../../lib/logger"; +import { logger } from "../../lib/logger"; +import { scrapeURL } from "../scrapeURL"; +import { scrapeOptions } from "../../controllers/v1/types"; export async function getLinksFromSitemap( { @@ -17,17 +18,20 @@ export async function getLinksFromSitemap( } ): Promise { try { - let content: string; + let content: string = ""; try { if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') { const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"playwright" } }); - content = response.html; + const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });; + if (!response.success) { + throw response.error; + } + content = response.document.rawHtml!; } } catch (error) { - Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`); + logger.error(`Request failed for ${sitemapUrl}: ${error.message}`); return allUrls; } @@ -47,7 +51,7 @@ export async function getLinksFromSitemap( allUrls.push(...validUrls); } } catch (error) { - Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`); + logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`); } return allUrls; diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts deleted file mode 100644 index 53237ef8..00000000 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/docxProcessor.test.ts +++ /dev/null @@ -1,15 +0,0 @@ -import * as docxProcessor from "../docxProcessor"; - -describe("DOCX Processing Module - Integration Test", () => { - it("should correctly process a simple DOCX file without the LLAMAPARSE_API_KEY", async () => { - delete process.env.LLAMAPARSE_API_KEY; - const { content, pageStatusCode, pageError } = await docxProcessor.fetchAndProcessDocx( - "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx" - ); - expect(content.trim()).toContain( - "SERIES A PREFERRED STOCK PURCHASE AGREEMENT" - ); - expect(pageStatusCode).toBe(200); - expect(pageError).toBeUndefined(); - }); -}); diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts deleted file mode 100644 index 8d644c7b..00000000 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts +++ /dev/null @@ -1,128 +0,0 @@ -import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable'; -import cheerio from 'cheerio'; - -describe('parseTablesToMarkdown', () => { - it('converts a simple HTML table 
to Markdown', async () => {
-    const html = `
-      <table>
-        <tr>
-          <th>Header 1</th>
-          <th>Header 2</th>
-        </tr>
-        <tr>
-          <td>Row 1 Col 1</td>
-          <td>Row 1 Col 2</td>
-        </tr>
-        <tr>
-          <td>Row 2 Col 1</td>
-          <td>Row 2 Col 2</td>
-        </tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with a single row to Markdown', async () => {
-    const html = `
-      <table>
-        <tr>
-          <th>Header 1</th>
-          <th>Header 2</th>
-        </tr>
-        <tr>
-          <td>Row 1 Col 1</td>
-          <td>Row 1 Col 2</td>
-        </tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with a single column to Markdown', async () => {
-    const html = `
-      <table>
-        <tr>
-          <th>Header 1</th>
-        </tr>
-        <tr>
-          <td>Row 1 Col 1</td>
-        </tr>
-        <tr>
-          <td>Row 2 Col 1</td>
-        </tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with a single cell to Markdown', async () => {
-    const html = `
-      <table>
-        <tr>
-          <th>Header 1</th>
-        </tr>
-        <tr>
-          <td>Row 1 Col 1</td>
-        </tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no header to Markdown', async () => {
-    const html = `
-      <table>
-        <tr>
-          <td>Row 1 Col 1</td>
-          <td>Row 1 Col 2</td>
-        </tr>
-        <tr>
-          <td>Row 2 Col 1</td>
-          <td>Row 2 Col 2</td>
-        </tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no rows to Markdown', async () => {
-    const html = `
-      <table>
-      </table>
-    `;
-    const expectedMarkdown = `<div></div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no cells to Markdown', async () => {
-    const html = `
-      <table>
-        <tr></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div></div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no columns to Markdown', async () => {
-    const html = `
-      <table>
-        <tr></tr>
-      </table>
-    `;
-    const expectedMarkdown = `<div></div>`;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-  it('converts a table with no table to Markdown', async () => {
-    const html = ``;
-    const expectedMarkdown = ``;
-    const markdown = await parseTablesToMarkdown(html);
-    expect(markdown).toBe(expectedMarkdown);
-  });
-
-it('converts a table inside of a bunch of html noise', async () => {
-  const html = `
-  <div>
-    <p>Some text before</p>
-    <table>
-      <tr>
-        <td>Row 1 Col 1</td>
-        <td>Row 1 Col 2</td>
-      </tr>
-      <tr>
-        <td>Row 2 Col 1</td>
-        <td>Row 2 Col 2</td>
-      </tr>
-    </table>
-    <p>Some text after</p>
-  </div>
-  `;
-  const expectedMarkdown = `
-  <div>
-    <p>Some text before</p>
-    <div>| Row 1 Col 1 | Row 1 Col 2 |
-| Row 2 Col 1 | Row 2 Col 2 |</div>
-    <p>Some text after</p>
-  </div>
`; - - const markdown = await parseTablesToMarkdown(html); - expect(markdown).toBe(expectedMarkdown); -}); - -}); diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts deleted file mode 100644 index 18302654..00000000 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts +++ /dev/null @@ -1,19 +0,0 @@ -import * as pdfProcessor from '../pdfProcessor'; - -describe('PDF Processing Module - Integration Test', () => { - it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => { - delete process.env.LLAMAPARSE_API_KEY; - const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf', true); - expect(content.trim()).toEqual("Dummy PDF file"); - expect(pageStatusCode).toEqual(200); - expect(pageError).toBeUndefined(); - }); - - it('should return a successful response for a valid scrape with PDF file and parsePDF set to false', async () => { - const { content, pageStatusCode, pageError } = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/astro-ph/9301001.pdf', false); - expect(pageStatusCode).toBe(200); - expect(pageError).toBeUndefined(); - expect(content).toContain('/Title(arXiv:astro-ph/9301001v1 7 Jan 1993)>>endobj'); - }, 60000); // 60 seconds - -}); diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts deleted file mode 100644 index b3d4a244..00000000 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts +++ /dev/null @@ -1,192 +0,0 @@ -import { removeUnwantedElements } from "../removeUnwantedElements"; -import { PageOptions } from "../../../../lib/entities"; - -describe('removeUnwantedElements', () => { - it('should remove script, style, iframe, noscript, meta, and head tags', () => { - const html = `Test
Content
`; - const options: PageOptions = {}; - const result = removeUnwantedElements(html, options); - expect(result).not.toContain('