fix(scrapeURL/pdf): fix llamaparse upload

This commit is contained in:
Gergő Móricz 2024-11-12 20:55:14 +01:00
parent 687ea69621
commit 9ace2ad071
4 changed files with 26 additions and 16 deletions

View File

@ -81,7 +81,6 @@
"escape-html": "^1.0.3", "escape-html": "^1.0.3",
"express-rate-limit": "^7.3.1", "express-rate-limit": "^7.3.1",
"express-ws": "^5.0.2", "express-ws": "^5.0.2",
"form-data": "^4.0.0",
"glob": "^10.4.2", "glob": "^10.4.2",
"gpt3-tokenizer": "^1.1.5", "gpt3-tokenizer": "^1.1.5",
"ioredis": "^5.4.1", "ioredis": "^5.4.1",

View File

@ -101,9 +101,6 @@ importers:
express-ws: express-ws:
specifier: ^5.0.2 specifier: ^5.0.2
version: 5.0.2(express@4.19.2) version: 5.0.2(express@4.19.2)
form-data:
specifier: ^4.0.0
version: 4.0.0
glob: glob:
specifier: ^10.4.2 specifier: ^10.4.2
version: 10.4.2 version: 10.4.2
@ -3932,8 +3929,8 @@ packages:
engines: {node: '>=14.17'} engines: {node: '>=14.17'}
hasBin: true hasBin: true
typescript@5.6.2: typescript@5.6.3:
resolution: {integrity: sha512-NW8ByodCSNCwZeghjN3o+JX5OFH0Ojg6sadjEKY4huZ52TqbJTJnDo5+Tw98lSy63NZvi4n+ez5m2u5d4PkZyw==} resolution: {integrity: sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==}
engines: {node: '>=14.17'} engines: {node: '>=14.17'}
hasBin: true hasBin: true
@ -7742,7 +7739,7 @@ snapshots:
csv-parse: 5.5.6 csv-parse: 5.5.6
gpt3-tokenizer: 1.1.5 gpt3-tokenizer: 1.1.5
openai: 3.3.0 openai: 3.3.0
typescript: 5.6.2 typescript: 5.6.3
uuid: 9.0.1 uuid: 9.0.1
zod: 3.23.8 zod: 3.23.8
transitivePeerDependencies: transitivePeerDependencies:
@ -8320,7 +8317,7 @@ snapshots:
typescript@5.4.5: {} typescript@5.4.5: {}
typescript@5.6.2: {} typescript@5.6.3: {}
typesense@1.8.2(@babel/runtime@7.24.6): typesense@1.8.2(@babel/runtime@7.24.6):
dependencies: dependencies:

View File

@ -1,5 +1,4 @@
import { createReadStream, promises as fs } from "node:fs"; import { createReadStream, promises as fs } from "node:fs";
import FormData from "form-data";
import { Meta } from "../.."; import { Meta } from "../..";
import { EngineScrapeResult } from ".."; import { EngineScrapeResult } from "..";
import * as marked from "marked"; import * as marked from "marked";
@ -16,10 +15,26 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath }); meta.logger.debug("Processing PDF document with LlamaIndex", { tempFilePath });
const uploadForm = new FormData(); const uploadForm = new FormData();
uploadForm.append("file", createReadStream(tempFilePath), {
filename: tempFilePath, // This is utterly stupid but it works! - mogery
contentType: "application/pdf", // NOTE: request.headers["Content-Type"]? uploadForm.append("file", {
}); [Symbol.toStringTag]: "Blob",
name: tempFilePath,
stream() {
return createReadStream(tempFilePath) as unknown as ReadableStream<Uint8Array>
},
arrayBuffer() {
throw Error("Unimplemented in mock Blob: arrayBuffer")
},
size: (await fs.stat(tempFilePath)).size,
text() {
throw Error("Unimplemented in mock Blob: text")
},
slice(start, end, contentType) {
throw Error("Unimplemented in mock Blob: slice")
},
type: "application/pdf",
} as Blob);
const upload = await robustFetch({ const upload = await robustFetch({
url: "https://api.cloud.llamaindex.ai/api/parsing/upload", url: "https://api.cloud.llamaindex.ai/api/parsing/upload",

View File

@ -2,7 +2,6 @@ import { Logger } from "winston";
import { z, ZodError } from "zod"; import { z, ZodError } from "zod";
import { v4 as uuid } from "uuid"; import { v4 as uuid } from "uuid";
import * as Sentry from "@sentry/node"; import * as Sentry from "@sentry/node";
import FormData from "form-data";
export type RobustFetchParams<Schema extends z.Schema<any>> = { export type RobustFetchParams<Schema extends z.Schema<any>> = {
url: string; url: string;
@ -38,14 +37,14 @@ export async function robustFetch<Schema extends z.Schema<any>, Output = z.infer
method, method,
headers: { headers: {
...(body instanceof FormData ...(body instanceof FormData
? body.getHeaders() ? ({})
: body !== undefined ? ({ : body !== undefined ? ({
"Content-Type": "application/json", "Content-Type": "application/json",
}) : {}), }) : {}),
...(headers !== undefined ? headers : {}), ...(headers !== undefined ? headers : {}),
}, },
...(body instanceof FormData ? ({ ...(body instanceof FormData ? ({
body: body.getBuffer(), body,
}) : body !== undefined ? ({ }) : body !== undefined ? ({
body: JSON.stringify(body), body: JSON.stringify(body),
}) : {}), }) : {}),