mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-15 19:22:19 +08:00
fix(scrapeURL/pdf,docx): ignore SSL when downloading PDF
This commit is contained in:
parent
7081beff1f
commit
16e850288c
|
@ -113,6 +113,7 @@
|
||||||
"turndown": "^7.1.3",
|
"turndown": "^7.1.3",
|
||||||
"turndown-plugin-gfm": "^1.0.2",
|
"turndown-plugin-gfm": "^1.0.2",
|
||||||
"typesense": "^1.5.4",
|
"typesense": "^1.5.4",
|
||||||
|
"undici": "^6.20.1",
|
||||||
"unstructured-client": "^0.11.3",
|
"unstructured-client": "^0.11.3",
|
||||||
"uuid": "^10.0.0",
|
"uuid": "^10.0.0",
|
||||||
"winston": "^3.14.2",
|
"winston": "^3.14.2",
|
||||||
|
|
|
@ -197,6 +197,9 @@ importers:
|
||||||
typesense:
|
typesense:
|
||||||
specifier: ^1.5.4
|
specifier: ^1.5.4
|
||||||
version: 1.8.2(@babel/runtime@7.24.6)
|
version: 1.8.2(@babel/runtime@7.24.6)
|
||||||
|
undici:
|
||||||
|
specifier: ^6.20.1
|
||||||
|
version: 6.20.1
|
||||||
unstructured-client:
|
unstructured-client:
|
||||||
specifier: ^0.11.3
|
specifier: ^0.11.3
|
||||||
version: 0.11.3(zod@3.23.8)
|
version: 0.11.3(zod@3.23.8)
|
||||||
|
@ -3957,6 +3960,10 @@ packages:
|
||||||
undici-types@5.26.5:
|
undici-types@5.26.5:
|
||||||
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
|
resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==}
|
||||||
|
|
||||||
|
undici@6.20.1:
|
||||||
|
resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==}
|
||||||
|
engines: {node: '>=18.17'}
|
||||||
|
|
||||||
union@0.5.0:
|
union@0.5.0:
|
||||||
resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
|
resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==}
|
||||||
engines: {node: '>= 0.8.0'}
|
engines: {node: '>= 0.8.0'}
|
||||||
|
@ -8341,6 +8348,8 @@ snapshots:
|
||||||
|
|
||||||
undici-types@5.26.5: {}
|
undici-types@5.26.5: {}
|
||||||
|
|
||||||
|
undici@6.20.1: {}
|
||||||
|
|
||||||
union@0.5.0:
|
union@0.5.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
qs: 6.12.2
|
qs: 6.12.2
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { Meta } from "../..";
|
import { Meta } from "../..";
|
||||||
import { EngineScrapeResult } from "..";
|
import { EngineScrapeResult } from "..";
|
||||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
import { downloadFile } from "../utils/downloadFile";
|
||||||
import mammoth from "mammoth";
|
import mammoth from "mammoth";
|
||||||
|
|
||||||
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
||||||
|
|
|
@ -62,7 +62,7 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
|
||||||
schema: z.object({
|
schema: z.object({
|
||||||
markdown: z.string(),
|
markdown: z.string(),
|
||||||
}),
|
}),
|
||||||
tryCount: 16,
|
tryCount: 32,
|
||||||
tryCooldown: 250,
|
tryCooldown: 250,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs";
|
||||||
import { EngineError } from "../../error";
|
import { EngineError } from "../../error";
|
||||||
import { Writable } from "stream";
|
import { Writable } from "stream";
|
||||||
import { v4 as uuid } from "uuid";
|
import { v4 as uuid } from "uuid";
|
||||||
|
import * as undici from "undici";
|
||||||
|
|
||||||
export async function fetchFileToBuffer(url: string): Promise<{
|
export async function fetchFileToBuffer(url: string): Promise<{
|
||||||
response: Response,
|
response: Response,
|
||||||
|
@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function downloadFile(id: string, url: string): Promise<{
|
export async function downloadFile(id: string, url: string): Promise<{
|
||||||
response: Response
|
response: undici.Response
|
||||||
tempFilePath: string
|
tempFilePath: string
|
||||||
}> {
|
}> {
|
||||||
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
|
const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`);
|
||||||
const tempFileWrite = createWriteStream(tempFilePath);
|
const tempFileWrite = createWriteStream(tempFilePath);
|
||||||
|
|
||||||
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
// TODO: maybe we could use tlsclient for this? for proxying
|
||||||
|
// use undici to ignore SSL for now
|
||||||
|
const response = await undici.fetch(url, {
|
||||||
|
dispatcher: new undici.Agent({
|
||||||
|
connect: {
|
||||||
|
rejectUnauthorized: false,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
// This should never happen in the current state of JS (2024), but let's check anyways.
|
// This should never happen in the current state of JS (2024), but let's check anyways.
|
||||||
if (response.body === null) {
|
if (response.body === null) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user