bugfix for PDFs and logging PDF events, also added try/catches for DOCX

rafaelsideguide 2024-07-29 14:13:46 -03:00
parent 4c9d62f6d3
commit 49e3e64787
3 changed files with 129 additions and 30 deletions


@@ -20,6 +20,7 @@ import { getWebScraperQueue } from "../../../src/services/queue-service";
 import { fetchAndProcessDocx } from "./utils/docxProcessor";
 import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils";
 import { Logger } from "../../lib/logger";
+import { ScrapeEvents } from "../../lib/scrape-events";

 export class WebScraperDataProvider {
   private jobId: string;
@@ -316,10 +317,28 @@ export class WebScraperDataProvider {
   private async fetchPdfDocuments(pdfLinks: string[]): Promise<Document[]> {
     return Promise.all(
       pdfLinks.map(async (pdfLink) => {
+        const timer = Date.now();
+        const logInsertPromise = ScrapeEvents.insert(this.jobId, {
+          type: "scrape",
+          url: pdfLink,
+          worker: process.env.FLY_MACHINE_ID,
+          method: "pdf-scrape",
+          result: null,
+        });
+
         const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
           pdfLink,
           this.pageOptions.parsePDF
         );
+
+        const insertedLogId = await logInsertPromise;
+        ScrapeEvents.updateScrapeResult(insertedLogId, {
+          response_size: content.length,
+          success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
+          error: pageError,
+          response_code: pageStatusCode,
+          time_taken: Date.now() - timer,
+        });
         return {
           content: content,
           metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
@@ -330,12 +349,32 @@ WebScraperDataProvider
   }

   private async fetchDocxDocuments(docxLinks: string[]): Promise<Document[]> {
     return Promise.all(
-      docxLinks.map(async (p) => {
-        const { content, pageStatusCode, pageError } =
-          await fetchAndProcessDocx(p);
+      docxLinks.map(async (docxLink) => {
+        const timer = Date.now();
+        const logInsertPromise = ScrapeEvents.insert(this.jobId, {
+          type: "scrape",
+          url: docxLink,
+          worker: process.env.FLY_MACHINE_ID,
+          method: "docx-scrape",
+          result: null,
+        });
+
+        const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
+          docxLink
+        );
+
+        const insertedLogId = await logInsertPromise;
+        ScrapeEvents.updateScrapeResult(insertedLogId, {
+          response_size: content.length,
+          success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100),
+          error: pageError,
+          response_code: pageStatusCode,
+          time_taken: Date.now() - timer,
+        });
         return {
           content,
-          metadata: { sourceURL: p, pageStatusCode, pageError },
+          metadata: { sourceURL: docxLink, pageStatusCode, pageError },
           provider: "web-scraper",
         };
       })
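
The two fetchers above share one logging pattern: start the ScrapeEvents insert before the download (without awaiting it), run the fetch, then await the insert's id and attach the outcome metrics. Below is a minimal sketch of that pattern factored into a reusable helper; the helper itself is hypothetical and not part of this commit, and it assumes (as the diff suggests) that ScrapeEvents.insert resolves to a log id accepted by ScrapeEvents.updateScrapeResult.

import { ScrapeEvents } from "../../lib/scrape-events";

// Hypothetical helper: wraps any document fetcher with scrape-event logging.
async function withScrapeLog(
  jobId: string,
  url: string,
  method: string,
  fetcher: () => Promise<{ content: string; pageStatusCode?: number; pageError?: string }>
) {
  const timer = Date.now();
  // Kick off the insert immediately; the fetch does not wait for it.
  const logInsertPromise = ScrapeEvents.insert(jobId, {
    type: "scrape",
    url,
    worker: process.env.FLY_MACHINE_ID,
    method,
    result: null,
  });

  const { content, pageStatusCode, pageError } = await fetcher();

  const insertedLogId = await logInsertPromise;
  ScrapeEvents.updateScrapeResult(insertedLogId, {
    response_size: content.length,
    // Same success heuristic as the diff: no 4xx/5xx status and at least
    // 100 characters of trimmed content.
    success: !(pageStatusCode && pageStatusCode >= 400) && !!content && content.trim().length >= 100,
    error: pageError,
    response_code: pageStatusCode,
    time_taken: Date.now() - timer,
  });

  return { content, pageStatusCode, pageError };
}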


@@ -4,15 +4,36 @@ import { createWriteStream } from "node:fs";
 import path from "path";
 import os from "os";
 import mammoth from "mammoth";
+import { Logger } from "../../../lib/logger";

 export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> {
-  const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url);
-  const content = await processDocxToText(tempFilePath);
+  let tempFilePath = '';
+  let pageStatusCode = 200;
+  let pageError = '';
+  let content = '';
+
+  try {
+    const downloadResult = await downloadDocx(url);
+    tempFilePath = downloadResult.tempFilePath;
+    pageStatusCode = downloadResult.pageStatusCode;
+    pageError = downloadResult.pageError;
+    content = await processDocxToText(tempFilePath);
+  } catch (error) {
+    Logger.error(`Failed to fetch and process DOCX: ${error.message}`);
+    pageStatusCode = 500;
+    pageError = error.message;
+    content = '';
+  } finally {
+    if (tempFilePath) {
       fs.unlinkSync(tempFilePath); // Clean up the temporary file
+    }
+  }

   return { content, pageStatusCode, pageError };
 }

 async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> {
+  try {
     const response = await axios({
       url,
       method: "GET",
@@ -26,16 +47,33 @@ async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageSt
     return new Promise((resolve, reject) => {
       writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }));
-      writer.on("error", reject);
+      writer.on("error", () => {
+        Logger.error('Failed to write DOCX file to disk');
+        reject(new Error('Failed to write DOCX file to disk'));
+      });
     });
+  } catch (error) {
+    Logger.error(`Failed to download DOCX: ${error.message}`);
+    return { tempFilePath: "", pageStatusCode: 500, pageError: error.message };
+  }
 }

 export async function processDocxToText(filePath: string): Promise<string> {
+  try {
     const content = await extractTextFromDocx(filePath);
     return content;
+  } catch (error) {
+    Logger.error(`Failed to process DOCX to text: ${error.message}`);
+    return "";
+  }
 }

 async function extractTextFromDocx(filePath: string): Promise<string> {
+  try {
     const result = await mammoth.extractRawText({ path: filePath });
     return result.value;
+  } catch (error) {
+    Logger.error(`Failed to extract text from DOCX: ${error.message}`);
+    return "";
+  }
 }
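
With the try/catch/finally above, fetchAndProcessDocx no longer throws: download or extraction failures come back as pageStatusCode 500 with the message in pageError and an empty content string, and the temporary file is unlinked on every path. A quick caller-side sketch of that contract follows; it is not from the commit, and the URL is a placeholder.

import { fetchAndProcessDocx } from "./docxProcessor";

async function scrapeDocxExample() {
  const { content, pageStatusCode, pageError } = await fetchAndProcessDocx(
    "https://example.com/report.docx" // placeholder URL
  );
  if (pageStatusCode >= 400 || pageError) {
    // Failure path: content is "" and pageError carries the message.
    console.warn(`DOCX scrape failed (${pageStatusCode}): ${pageError}`);
    return;
  }
  console.log(`Extracted ${content.length} characters of text`);
}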


@@ -76,7 +76,6 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
   let attempt = 0;
   const maxAttempts = 10; // Maximum number of attempts
   let resultAvailable = false;
-
   while (attempt < maxAttempts && !resultAvailable) {
     try {
       resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
@@ -90,13 +89,22 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
     } catch (error) {
       Logger.debug("Error fetching result w/ LlamaIndex");
       attempt++;
+      if (attempt >= maxAttempts) {
+        Logger.error("Max attempts reached, unable to fetch result.");
+        break; // Exit the loop if max attempts are reached
+      }
       await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying
       // You may want to handle specific errors differently
     }
   }

   if (!resultAvailable) {
+    try {
       content = await processPdf(filePath);
+    } catch (error) {
+      Logger.error(`Failed to process PDF: ${error}`);
+      content = "";
+    }
   }
   content = resultResponse.data[resultType];
 } catch (error) {
@@ -104,15 +112,29 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
       content = await processPdf(filePath);
     }
   } else if (parsePDF) {
+    try {
       content = await processPdf(filePath);
+    } catch (error) {
+      Logger.error(`Failed to process PDF: ${error}`);
+      content = "";
+    }
   } else {
+    try {
       content = fs.readFileSync(filePath, "utf-8");
+    } catch (error) {
+      Logger.error(`Failed to read PDF file: ${error}`);
+      content = "";
+    }
   }
   return content;
 }

 async function processPdf(file: string) {
+  try {
     const fileContent = fs.readFileSync(file);
     const data = await pdf(fileContent);
     return data.text;
+  } catch (error) {
+    throw error;
+  }
 }
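
The PDF changes harden the LlamaParse polling loop: up to 10 attempts with a 0.5 s delay, an explicit break with an error log once attempts run out, and local pdf-parse (processPdf) wrapped in try/catch as the fallback. Note that even when polling never succeeds, the unconditional resultResponse.data[resultType] read that follows throws (resultResponse being unset) and is absorbed by the outer catch, which also falls back to processPdf. Below is a standalone sketch of that polling shape; names like fetchOnce are illustrative placeholders, not this file's API.

// Hypothetical generic poller mirroring the loop above: retry with a fixed
// delay, stop early when attempts are exhausted, return null so the caller
// can fall back (here, to local pdf-parse).
async function pollForResult<T>(
  fetchOnce: () => Promise<T | null>,
  maxAttempts = 10, // same cap as the diff
  delayMs = 500     // same 0.5 s back-off as the diff
): Promise<T | null> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const result = await fetchOnce();
      if (result !== null) return result; // result became available
    } catch {
      // Transient fetch error; fall through to the delay and retry.
    }
    if (attempt + 1 >= maxAttempts) {
      console.error("Max attempts reached, unable to fetch result.");
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  return null; // caller falls back to processPdf
}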