Mirror of https://github.com/mendableai/firecrawl.git, synced 2024-11-16 03:32:22 +08:00
Bugfix for PDFs and logging of PDF events; also added try/catches for DOCX
This commit is contained in:
parent 4c9d62f6d3
commit 49e3e64787
apps/api/src/scraper/WebScraper/index.ts

@@ -20,6 +20,7 @@ import { getWebScraperQueue } from "../../../src/services/queue-service";
 import { fetchAndProcessDocx } from "./utils/docxProcessor";
 import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
 import { Logger } from "../../lib/logger";
+import { ScrapeEvents } from "../../lib/scrape-events";
 
 export class WebScraperDataProvider {
   private jobId: string;
@@ -316,10 +317,28 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
+        const timer = Date.now();
+        const logInsertPromise = ScrapeEvents.insert(this.jobId, {
+          type: "scrape",
+          url: pdfLink,
+          worker: process.env.FLY_MACHINE_ID,
+          method: "pdf-scrape",
+          result: null,
+        });
+
         const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
           pdfLink,
           this.pageOptions.parsePDF
         );
+
+        const insertedLogId = await logInsertPromise;
+        ScrapeEvents.updateScrapeResult(insertedLogId, {
+          response_size: content.length,
+          success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
+          error: pageError,
+          response_code: pageStatusCode,
+          time_taken: Date.now() - timer,
+        });
         return {
           content: content,
           metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
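
Note on the hunk above: the logging follows an insert-early, update-late pattern. The event row is inserted before the scrape starts, without awaiting it, so the database write overlaps the network work; the insert is only awaited afterwards, when its id is needed to attach the result. A minimal sketch of that shape, assuming (from usage in the diff, not confirmed) that ScrapeEvents.insert resolves to the new row's id and updateScrapeResult attaches the outcome to it:

// Sketch of the insert-early / update-late logging shape used above.
// ScrapeEvents.insert / updateScrapeResult signatures are assumptions
// inferred from how the diff calls them.
import { ScrapeEvents } from "../../lib/scrape-events";

type ScrapeOutcome = { content: string; pageStatusCode?: number; pageError?: string };

async function withScrapeEvent<T extends ScrapeOutcome>(
  jobId: string,
  url: string,
  method: string,
  scrape: () => Promise<T>
): Promise<T> {
  const timer = Date.now();
  // Start the insert without awaiting it, so the DB write overlaps the scrape.
  const logInsertPromise = ScrapeEvents.insert(jobId, {
    type: "scrape",
    url,
    worker: process.env.FLY_MACHINE_ID,
    method,
    result: null,
  });

  const result = await scrape();

  // Only now await the insert, because its id is needed for the update.
  const insertedLogId = await logInsertPromise;
  ScrapeEvents.updateScrapeResult(insertedLogId, {
    response_size: result.content.length,
    success:
      !(result.pageStatusCode && result.pageStatusCode >= 400) &&
      !!result.content &&
      result.content.trim().length >= 100,
    error: result.pageError,
    response_code: result.pageStatusCode,
    time_taken: Date.now() - timer,
  });
  return result;
}

With such a helper, fetchPdfDocuments could reduce to withScrapeEvent(this.jobId, pdfLink, "pdf-scrape", () => fetchAndProcessPdf(pdfLink, this.pageOptions.parsePDF)); the commit keeps the logic inline in each method instead.
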
@@ -330,12 +349,32 @@ export class WebScraperDataProvider {
   }
   private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
     return Promise.all(
-      docxLinks.map(async (p) => {
-        const { content, pageStatusCode, pageError } =
-          await fetchAndProcessDocx(p);
+      docxLinks.map(async (docxLink) => {
+        const timer = Date.now();
+        const logInsertPromise = ScrapeEvents.insert(this.jobId, {
+          type: "scrape",
+          url: docxLink,
+          worker: process.env.FLY_MACHINE_ID,
+          method: "docx-scrape",
+          result: null,
+        });
+
+        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
+          docxLink
+        );
+
+        const insertedLogId = await logInsertPromise;
+        ScrapeEvents.updateScrapeResult(insertedLogId, {
+          response_size: content.length,
+          success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
+          error: pageError,
+          response_code: pageStatusCode,
+          time_taken: Date.now() - timer,
+        });
+
         return {
           content,
-          metadata: { sourceURL: p, pageStatusCode, pageError },
+          metadata: { sourceURL: docxLink, pageStatusCode, pageError },
           provider: "web-scraper",
         };
       })
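
Both methods now embed the same success heuristic: the HTTP status must be below 400 and the extracted text must be non-trivial, i.e. at least 100 characters after trimming. Expressed as a standalone predicate (a hypothetical refactor for illustration, not something this commit adds):

// Hypothetical helper distilling the success check duplicated in both methods.
function isScrapeSuccess(content: string, pageStatusCode?: number): boolean {
  const httpOk = !(pageStatusCode && pageStatusCode >= 400); // a missing status counts as ok
  const hasMeaningfulText = !!content && content.trim().length >= 100;
  return httpOk && hasMeaningfulText;
}

// isScrapeSuccess("", 200)              -> false (empty content)
// isScrapeSuccess("x".repeat(99), 200)  -> false (under the 100-char floor)
// isScrapeSuccess("x".repeat(100), 404) -> false (HTTP error)
// isScrapeSuccess("x".repeat(100), 200) -> true
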
apps/api/src/scraper/WebScraper/utils/docxProcessor.ts

@@ -4,15 +4,36 @@ import { createWriteStream } from "node:fs";
 import path from "path";
 import os from "os";
 import mammoth from "mammoth";
+import { Logger } from "../../../lib/logger";
 
 export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
-  const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
-  const content = await processDocxToText(tempFilePath);
+  let tempFilePath = '';
+  let pageStatusCode = 200;
+  let pageError = '';
+  let content = '';
+
+  try {
+    const downloadResult = await downloadDocx(url);
+    tempFilePath = downloadResult.tempFilePath;
+    pageStatusCode = downloadResult.pageStatusCode;
+    pageError = downloadResult.pageError;
+    content = await processDocxToText(tempFilePath);
+  } catch (error) {
+    Logger.error(`Failed to fetch and process DOCX: ${error.message}`);
+    pageStatusCode = 500;
+    pageError = error.message;
+    content = '';
+  } finally {
+    if (tempFilePath) {
   fs.unlinkSync(tempFilePath); // Clean up the temporary file
+    }
+  }
+
   return { content, pageStatusCode, pageError };
 }
 
 async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
+  try {
   const response = await axios({
     url,
     method: "GET",

@@ -26,16 +47,33 @@ async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
 
   return new Promise((resolve, reject) => {
     writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
-    writer.on("error", reject);
+    writer.on("error", () => {
+      Logger.error('Failed to write DOCX file to disk');
+      reject(new Error('Failed to write DOCX file to disk'));
   });
+  });
+  } catch (error) {
+    Logger.error(`Failed to download DOCX: ${error.message}`);
+    return { tempFilePath: "", pageStatusCode: 500, pageError: error.message };
+  }
 }
 
 export async function processDocxToText(filePath: string): Promise<string> {
+  try {
   const content = await extractTextFromDocx(filePath);
   return content;
+  } catch (error) {
+    Logger.error(`Failed to process DOCX to text: ${error.message}`);
+    return "";
+  }
 }
 
 async function extractTextFromDocx(filePath: string): Promise<string> {
+  try {
   const result = await mammoth.extractRawText({ path: filePath });
   return result.value;
+  } catch (error) {
+    Logger.error(`Failed to extract text from DOCX: ${error.message}`);
+    return "";
+  }
 }
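
The reworked fetchAndProcessDocx is a download / extract / cleanup pipeline: any failure is logged and converted into an error result rather than a thrown exception, and the finally block unlinks the temp file whenever one was created. A reduced sketch of that shape, with hypothetical download and extract parameters standing in for downloadDocx and processDocxToText:

// Reduced sketch of the download / extract / cleanup shape above.
import * as fs from "fs";

async function fetchProcessCleanup(
  download: () => Promise<string>,               // resolves to a temp file path
  extract: (filePath: string) => Promise<string> // extracts text from that file
): Promise<{ content: string; pageError: string }> {
  let tempFilePath = "";
  let content = "";
  let pageError = "";
  try {
    tempFilePath = await download();
    content = await extract(tempFilePath);
  } catch (error) {
    // Failures become an error result instead of an exception for the caller.
    pageError = error instanceof Error ? error.message : String(error);
  } finally {
    if (tempFilePath) {
      fs.unlinkSync(tempFilePath); // runs on success and failure alike
    }
  }
  return { content, pageError };
}

One subtlety in the real function: downloadDocx and processDocxToText now catch their own errors and return an empty path or empty string, so the outer try/catch is belt-and-braces; the guarded finally still covers both success and failure paths.
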
apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts

@@ -76,7 +76,6 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
       let attempt = 0;
       const maxAttempts = 10; // Maximum number of attempts
       let resultAvailable = false;
-
       while (attempt < maxAttempts && !resultAvailable) {
         try {
           resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });

@@ -90,13 +89,22 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
         } catch (error) {
           Logger.debug("Error fetching result w/ LlamaIndex");
           attempt++;
+          if (attempt >= maxAttempts) {
+            Logger.error("Max attempts reached, unable to fetch result.");
+            break; // Exit the loop if max attempts are reached
+          }
           await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
           // You may want to handle specific errors differently
         }
       }
 
       if (!resultAvailable) {
+        try {
         content = await processPdf(filePath);
+        } catch (error) {
+          Logger.error(`Failed to process PDF: ${error}`);
+          content = "";
+        }
       }
       content = resultResponse.data[resultType];
     } catch (error) {

@@ -104,15 +112,29 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
       content = await processPdf(filePath);
     }
   } else if (parsePDF) {
+    try {
     content = await processPdf(filePath);
+    } catch (error) {
+      Logger.error(`Failed to process PDF: ${error}`);
+      content = "";
+    }
   } else {
+    try {
     content = fs.readFileSync(filePath, "utf-8");
+    } catch (error) {
+      Logger.error(`Failed to read PDF file: ${error}`);
+      content = "";
+    }
   }
   return content;
 }
 
 async function processPdf(file: string) {
+  try {
   const fileContent = fs.readFileSync(file);
   const data = await pdf(fileContent);
   return data.text;
+  } catch (error) {
+    throw error;
+  }
 }
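
The PDF fix adds an explicit max-attempts check inside the polling loop's catch: once the counter hits maxAttempts, the loop logs the failure and breaks immediately instead of sleeping once more before the while condition ends it, and the caller falls back to local parsing. A self-contained sketch of the corrected control flow, with a hypothetical poll callback standing in for the axios fetch of the LlamaParse result:

// Sketch of the bounded poll-with-retry control flow after the fix.
// `poll` is a hypothetical stand-in for the remote result fetch.
async function pollWithRetries<T>(
  poll: () => Promise<T | undefined>,
  maxAttempts = 10
): Promise<T | undefined> {
  let attempt = 0;
  let result: T | undefined;
  while (attempt < maxAttempts && result === undefined) {
    try {
      result = await poll(); // undefined means "not ready yet"
      if (result === undefined) {
        attempt++;
        await new Promise((resolve) => setTimeout(resolve, 500)); // poll interval
      }
    } catch (error) {
      attempt++;
      if (attempt >= maxAttempts) {
        console.error(`Max attempts reached, unable to fetch result: ${error}`);
        break; // log and exit immediately instead of sleeping one more time
      }
      await new Promise((resolve) => setTimeout(resolve, 500)); // wait 0.5 s before retrying
    }
  }
  return result; // undefined -> caller falls back to local PDF parsing
}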