diff --git a/apps/api/package.json b/apps/api/package.json
index aebd90a5..0da99459 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -113,6 +113,7 @@
     "turndown": "^7.1.3",
     "turndown-plugin-gfm": "^1.0.2",
     "typesense": "^1.5.4",
+    "undici": "^6.20.1",
     "unstructured-client": "^0.11.3",
     "uuid": "^10.0.0",
     "winston": "^3.14.2",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index f98055fb..c2a9c8a3 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -197,6 +197,9 @@ importers:
       typesense:
        specifier: ^1.5.4
        version: 1.8.2(@babel/runtime@7.24.6)
+      undici:
+       specifier: ^6.20.1
+       version: 6.20.1
       unstructured-client:
        specifier: ^0.11.3
        version: 0.11.3(zod@3.23.8)
@@ -3957,6 +3960,10 @@ packages:
   undici-types@5.26.5:
    resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}

+  undici@6.20.1:
+   resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==}
+   engines: {node: '>=18.17'}
+
   union@0.5.0:
    resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
    engines: {node: '>= 0.8.0'}
@@ -8341,6 +8348,8 @@ snapshots:
   undici-types@5.26.5: {}

+  undici@6.20.1: {}
+
   union@0.5.0:
    dependencies:
      qs: 6.12.2

diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
index f8196ccd..9881fae7 100644
--- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts
@@ -1,6 +1,6 @@
 import { Meta } from "../..";
 import { EngineScrapeResult } from "..";
-import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
+import { downloadFile } from "../utils/downloadFile";
 import mammoth from "mammoth";

 export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index d0591b57..bdc916e0 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -62,7 +62,7 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
       schema: z.object({
         markdown: z.string(),
       }),
-      tryCount: 16,
+      tryCount: 32,
       tryCooldown: 250,
     });

diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
index 8db8892b..736faba7 100644
--- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts
@@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs";
 import { EngineError } from "../../error";
 import { Writable } from "stream";
 import { v4 as uuid } from "uuid";
+import * as undici from "undici";

 export async function fetchFileToBuffer(url: string): Promise<{
   response: Response,
@@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{
 }

 export async function downloadFile(id: string, url: string): Promise<{
-  response: Response
+  response: undici.Response
   tempFilePath: string
 }> {
   const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
   const tempFileWrite = createWriteStream(tempFilePath);

-  const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
+  // TODO: maybe we could use tlsclient for this? for proxying
+  // use undici to ignore SSL for now
+  const response = await undici.fetch(url, {
+    dispatcher: new undici.Agent({
+      connect: {
+        rejectUnauthorized: false,
+      },
+    })
+  });

   // This should never happen in the current state of JS (2024), but let's check anyways.
   if (response.body === null) {
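
For context, the pattern the `downloadFile.ts` hunk adopts can be exercised in isolation. The sketch below is a minimal standalone example, not code from this PR; it assumes Node >= 18.17 with undici ^6, and the URL is a placeholder. It shows undici's `fetch` routed through a dedicated `Agent` whose connections skip TLS certificate verification:

```ts
// Minimal sketch of the undici pattern used in the diff; the URL is hypothetical.
import { fetch, Agent } from "undici";

// A dedicated dispatcher whose TCP/TLS connections skip certificate validation.
// This trades away MITM protection, so it should stay scoped to the one call
// that needs it rather than being installed globally.
const insecureAgent = new Agent({
  connect: { rejectUnauthorized: false },
});

const response = await fetch("https://example.com/file.bin", {
  dispatcher: insecureAgent,
});
console.log(response.status, response.headers.get("content-type"));
```

Passing the `Agent` per call via the `dispatcher` option (rather than through `setGlobalDispatcher`) keeps the relaxed TLS policy contained to file downloads, matching the scope of the change in the diff.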