Cleaned up

This commit is contained in:
rafaelsideguide 2024-05-13 16:13:10 -03:00
parent f4348024c6
commit 8eb2e95f19
3 changed files with 18 additions and 65 deletions

View File

@ -10,7 +10,7 @@ import { SitemapEntry, fetchSitemapData, getLinksFromSitemap } from "./sitemap";
import { WebCrawler } from "./crawler";
import { getValue, setValue } from "../../services/redis";
import { getImageDescription } from "./utils/imageDescription";
import { fetchAndProcessPdf, isUrlAPdf } from "./utils/pdfProcessor";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import {
replaceImgPathsWithAbsolutePaths,
replacePathsWithAbsolutePaths,
@ -144,11 +144,7 @@ export class WebScraperDataProvider {
return this.returnOnlyUrlsResponse(links, inProgress);
}
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return this.cacheAndFinalizeDocuments(documents, links);
}
@ -156,11 +152,8 @@ export class WebScraperDataProvider {
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
const links = this.urls;
// const [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return documents;
}
@ -172,11 +165,7 @@ export class WebScraperDataProvider {
return this.returnOnlyUrlsResponse(links, inProgress);
}
// let [pdfLinks, notPdfLinks] = await this.splitPdfLinks(links);
// const pdfDocuments = await this.fetchPdfDocuments(pdfLinks);
let documents = await this.processLinks(links, inProgress);
// documents.push(...pdfDocuments);
return this.cacheAndFinalizeDocuments(documents, links);
}
@ -233,19 +222,6 @@ export class WebScraperDataProvider {
);
}
/**
 * Partition a list of links into PDF and non-PDF links.
 *
 * Every link is checked with `isUrlAPdf`; the checks are started together
 * and awaited as a group, so a rejection from any single check rejects the
 * whole call. The relative order of links is preserved within each group.
 *
 * @param links The URLs to classify.
 * @returns A tuple `[pdfLinks, notPdfLinks]`.
 */
private async splitPdfLinks(links: string[]): Promise<[string[], string[]]> {
  const verdicts = await Promise.all(
    links.map((link) => isUrlAPdf({ url: link }))
  );

  const pdfLinks: string[] = [];
  const notPdfLinks: string[] = [];
  links.forEach((link, index) => {
    // verdicts[index] is the isUrlAPdf result for links[index].
    if (verdicts[index]) {
      pdfLinks.push(link);
    } else {
      notPdfLinks.push(link);
    }
  });

  return [pdfLinks, notPdfLinks];
}
private applyPathReplacements(documents: Document[]): Document[] {
return this.replaceAllPathsWithAbsolutePaths
? replacePathsWithAbsolutePaths(documents)

View File

@ -67,13 +67,11 @@ export async function scrapWithScrapingBee(
);
return "";
}
// Check the content type of the response
const contentType = response.headers['content-type'];
if (contentType && contentType.includes('application/pdf')) {
// Handle PDF content type
return fetchAndProcessPdf(url);
} else {
// Assume the content is text and decode it
const decoder = new TextDecoder();
const text = decoder.decode(response.data);
return text;
@ -104,9 +102,14 @@ export async function scrapWithPlaywright(url: string): Promise<string> {
return "";
}
const data = await response.json();
const html = data.content;
return html ?? "";
const contentType = response.headers['content-type'];
if (contentType && contentType.includes('application/pdf')) {
return fetchAndProcessPdf(url);
} else {
const data = await response.json();
const html = data.content;
return html ?? "";
}
} catch (error) {
console.error(`Error scraping with Puppeteer: ${error}`);
return "";
@ -173,7 +176,13 @@ export async function scrapSingleUrl(
);
return "";
}
text = await response.text();
const contentType = response.headers['content-type'];
if (contentType && contentType.includes('application/pdf')) {
return fetchAndProcessPdf(url);
} else {
text = await response.text();
}
} catch (error) {
console.error(`Error scraping URL: ${error}`);
return "";

View File

@ -105,36 +105,4 @@ async function processPdf(file: string) {
const fileContent = fs.readFileSync(file);
const data = await pdf(fileContent);
return data.text;
}
/**
 * Check whether a URL points to a PDF document.
 *
 * The URL itself is inspected first: it is treated as a PDF when either the
 * raw string or its parsed path component ends with ".pdf" (case-insensitive),
 * so links such as `/report.PDF` or `/report.pdf?download=1` are recognised
 * without a network round trip. Otherwise, unless `fastMode` is set, a HEAD
 * request is issued and the response `content-type` header is inspected.
 *
 * @param url The URL to check.
 * @param fastMode If true, skip the HEAD request and return false whenever
 *   the URL does not look like a PDF by its extension.
 * @returns A promise that resolves to true if the URL is a PDF, false
 *   otherwise. Any error (bad input, failed request) resolves to false.
 */
export async function isUrlAPdf({
  url,
  fastMode = false,
}: {
  url: string;
  fastMode?: boolean;
}): Promise<boolean> {
  try {
    if (hasPdfExtension(url)) {
      return true;
    }
    // If fast mode is enabled, we skip the HEAD request and return false
    if (fastMode) {
      return false;
    }
    const before = Date.now();
    const response = await axios.head(url);
    const after = Date.now();
    console.log(`${after - before}ms - HEAD Request for ${url}`);
    // The header may be absent; treat that as "not a PDF" explicitly instead
    // of relying on a thrown TypeError being swallowed by the catch below.
    const contentType: unknown = response.headers["content-type"];
    return (
      typeof contentType === "string" &&
      contentType.toLowerCase().includes("application/pdf")
    );
  } catch (error) {
    // Deliberate best effort: an unreachable or malformed URL is "not a PDF".
    return false;
  }
}

/** Returns true when the raw URL or its path component ends with ".pdf", ignoring case. */
function hasPdfExtension(url: string): boolean {
  if (url.toLowerCase().endsWith(".pdf")) {
    return true;
  }
  try {
    // Parsing drops the query string and fragment before the extension test.
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    // Relative or otherwise unparsable URL: only the raw-string test applies.
    return false;
  }
}
}