fix(scrapeURL/pdf,docx): ignore SSL when downloading PDF

This commit is contained in:
Gergő Móricz 2024-11-12 22:46:58 +01:00
parent 7081beff1f
commit 16e850288c
5 changed files with 23 additions and 4 deletions

View File

@ -113,6 +113,7 @@
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4",
"undici": "^6.20.1",
"unstructured-client": "^0.11.3",
"uuid": "^10.0.0",
"winston": "^3.14.2",

View File

@ -197,6 +197,9 @@ importers:
typesense:
specifier: ^1.5.4
version: 1.8.2(@babel/runtime@7.24.6)
undici:
specifier: ^6.20.1
version: 6.20.1
unstructured-client:
specifier: ^0.11.3
version: 0.11.3(zod@3.23.8)
@ -3957,6 +3960,10 @@ packages:
undici-types@5.26.5:
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
undici@6.20.1:
resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==}
engines: {node: '>=18.17'}
union@0.5.0:
resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
engines: {node: '>= 0.8.0'}
@ -8341,6 +8348,8 @@ snapshots:
undici-types@5.26.5: {}
undici@6.20.1: {}
union@0.5.0:
dependencies:
qs: 6.12.2

View File

@ -1,6 +1,6 @@
import { Meta } from "../..";
import { EngineScrapeResult } from "..";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { downloadFile } from "../utils/downloadFile";
import mammoth from "mammoth";
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {

View File

@ -62,7 +62,7 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
schema: z.object({
markdown: z.string(),
}),
tryCount: 16,
tryCount: 32,
tryCooldown: 250,
});

View File

@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs";
import { EngineError } from "../../error";
import { Writable } from "stream";
import { v4 as uuid } from "uuid";
import * as undici from "undici";
export async function fetchFileToBuffer(url: string): Promise<{
response: Response,
@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{
}
export async function downloadFile(id: string, url: string): Promise<{
response: Response
response: undici.Response
tempFilePath: string
}> {
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
const tempFileWrite = createWriteStream(tempFilePath);
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
// TODO: maybe we could use tlsclient for this? for proxying
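// use undici with certificate verification disabled (rejectUnauthorized: false) so downloads from hosts with invalid TLS certificates still succeed, for now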
const response = await undici.fetch(url, {
dispatcher: new undici.Agent({
connect: {
rejectUnauthorized: false,
},
})
});
// This should never happen in the current state of JS (2024), but let's check anyway.
if (response.body === null) {