testing crawl with new.abb.com case

Adds many unnecessary console.logs for tracing the code execution.
rafaelsideguide 2024-06-24 16:25:07 -03:00
parent 3c7b7e7242
commit 21d29de819
12 changed files with 128 additions and 35 deletions

View File

@@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json
 {
-    "url":"https://docs.mendable.ai"
+    "url":"new.abb.com/sustainability/foundation"
 }
@@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json
 {
-    "url":"https://www.mendable.ai",
-    "crawlerOptions": {
-        "returnOnlyUrls": true
-    }
+    "url": "new.abb.com/sustainability/foundation"
 }
+## "reoveTags": [],
+#     "mode": "crawl",
+#     "crawlerOptions": {
+#         "allowBackwardCrawling": false
+#     },
+#     "pageOptions": {
+#         "onlyMainContent": false,
+#         "includeHtml": false,
+#         "parsePDF": true
+#     }

View File

@@ -66,6 +66,7 @@ export async function crawlController(req: Request, res: Response) {
     parsePDF: true
   };
+  console.log('1. here OK!')
   if (mode === "single_urls" && !url.includes(",")) {
     try {
       const a = new WebScraperDataProvider();
@@ -84,6 +85,7 @@ export async function crawlController(req: Request, res: Response) {
           current_url: progress.currentDocumentUrl,
         });
       });
+      console.log('crawlController - return res.json...')
       return res.json({
         success: true,
         documents: docs,

View File

@@ -1,5 +1,6 @@
 export function parseMarkdown(html: string) {
+  console.log('parseMarkdown - start!')
   var TurndownService = require("turndown");
   var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
@@ -50,6 +51,6 @@ export function parseMarkdown(html: string) {
     /\[Skip to Content\]\(#[^\)]*\)/gi,
     ""
   );
+  console.log('parseMarkdown - return')
   return markdownContent;
 }

View File

@@ -78,9 +78,11 @@ export async function runWebScraper({
       pageOptions: pageOptions,
     });
   }
+  console.log('runWebScraper - getDocuments')
  const docs = (await provider.getDocuments(false, (progress: Progress) => {
    inProgress(progress);
  })) as Document[];
+  console.log('runWebScraper - getDocuments - done - docs.length:', docs.length)
  if (docs.length === 0) {
    return {

View File

@@ -129,24 +129,31 @@ export class WebCrawler {
   ): Promise<{ url: string, html: string }[]> {
     // Fetch and parse robots.txt
     try {
-      const response = await axios.get(this.robotsTxtUrl);
+      console.log('3.1 here OK')
+      console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
+      const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 });
+      console.log('????', {response})
+      console.log('3.2 here OK')
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
     }
+    console.log('4. here OK!')
     if(!crawlerOptions?.ignoreSitemap){
+      console.log('4.1')
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+      console.log('4.2')
       if (sitemapLinks.length > 0) {
+        console.log('4.3')
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+        console.log('4.4')
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
+    console.log('5. here OK!')
     const urls = await this.crawlUrls(
       [this.initialUrl],
       pageOptions,
@@ -154,7 +161,7 @@ export class WebCrawler {
       inProgress
     );
+    console.log('6. here OK!')
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -164,6 +171,7 @@ export class WebCrawler {
     // make sure to run include exclude here again
     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
+    console.log('7. here OK!')
     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
@@ -180,6 +188,7 @@ export class WebCrawler {
         }
         return;
       }
+      console.log('crawlUrls - crawl')
       const newUrls = await this.crawl(task, pageOptions);
       // add the initial url if not already added
       // if (this.visited.size === 1) {
@@ -192,7 +201,7 @@ export class WebCrawler {
       //   }
       // }
+      console.log('---??---')
       newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
       if (inProgress && newUrls.length > 0) {
@@ -210,12 +219,14 @@ export class WebCrawler {
           currentDocumentUrl: task,
         });
       }
+      console.log('----???----')
       await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
       if (callback && typeof callback === "function") {
         callback();
       }
     }, concurrencyLimit);
+    console.log('crawlUrls - queue.push')
     queue.push(
       urls.filter(
         (url) =>
@@ -225,7 +236,9 @@ export class WebCrawler {
         if (err) console.error(err);
       }
     );
+    console.log('crawlUrls - queue.drain')
     await queue.drain();
+    console.log('crawlUrls - return')
     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
   }
@@ -253,16 +266,22 @@ export class WebCrawler {
       // If it is the first link, fetch with single url
       if (this.visited.size === 1) {
+        console.log('crawl scrapSingleUrl...')
         const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+        console.log('got a page! lets continue...')
         content = page.html ?? "";
         pageStatusCode = page.metadata?.pageStatusCode;
         pageError = page.metadata?.pageError || undefined;
       } else {
-        const response = await axios.get(url);
+        // console.log('crawl - else')
+        const response = await axios.get(url, { timeout: 3000 });
+        console.log('crawl - else - response ok')
         content = response.data ?? "";
         pageStatusCode = response.status;
         pageError = response.statusText != "OK" ? response.statusText : undefined;
       }
+      console.log('crawl... keep going')
       const $ = load(content);
       let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
@@ -271,14 +290,17 @@ export class WebCrawler {
         links.push({ url, html: content, pageStatusCode, pageError });
       }
+      console.log('crawl... keep going 2')
       $("a").each((_, element) => {
         const href = $(element).attr("href");
         if (href) {
+          console.log('href:', href)
           let fullUrl = href;
           if (!href.startsWith("http")) {
             fullUrl = new URL(href, this.baseUrl).toString();
           }
           const urlObj = new URL(fullUrl);
+          console.log('urlObj:', urlObj)
           const path = urlObj.pathname;
@@ -295,10 +317,13 @@ export class WebCrawler {
           }
         }
       });
+      console.log('crawl... keep going 3')
       if (this.visited.size === 1) {
         return links;
       }
+      console.log('returning crawl...')
       // Create a new list to return to avoid modifying the visited list
       return links.filter((link) => !this.visited.has(link.url));
     } catch (error) {
@@ -385,6 +410,7 @@ export class WebCrawler {
   //
   private async tryFetchSitemapLinks(url: string): Promise<string[]> {
+    console.log("4.1.1 - Normalizing URL");
     const normalizeUrl = (url: string) => {
       url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
       if (url.endsWith("/")) {
@@ -393,46 +419,48 @@ export class WebCrawler {
       return url;
     };
+    console.log("4.1.2 - Constructing sitemap URL");
     const sitemapUrl = url.endsWith("/sitemap.xml")
       ? url
       : `${url}/sitemap.xml`;
     let sitemapLinks: string[] = [];
+    console.log("4.1.3 - Fetching sitemap from constructed URL");
     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: 3000 });
       if (response.status === 200) {
+        console.log("4.1.4 - Extracting links from sitemap");
         sitemapLinks = await getLinksFromSitemap(sitemapUrl);
       }
     } catch (error) {
-      // Error handling for failed sitemap fetch
-      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }
     if (sitemapLinks.length === 0) {
-      // If the first one doesn't work, try the base URL
+      console.log("4.1.5 - Trying base URL sitemap as fallback");
       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
-        const response = await axios.get(baseUrlSitemap);
+        const response = await axios.get(baseUrlSitemap, { timeout: 3000 });
         if (response.status === 200) {
+          console.log("4.1.6 - Extracting links from base URL sitemap");
          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
        }
      } catch (error) {
-        // Error handling for failed base URL sitemap fetch
-        // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
      }
    }
-    // Normalize and check if the URL is present in any of the sitemaps
+    console.log("4.1.7 - Normalizing sitemap links");
     const normalizedUrl = normalizeUrl(url);
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
-    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
+    console.log("4.1.8 - Checking if normalized URL is already included");
     if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-      // do not push the normalized url
+      console.log("4.1.9 - Adding initial URL to sitemap links");
       sitemapLinks.push(url);
     }
+    console.log("4.1.10 - Returning sitemap links");
     return sitemapLinks;
   }
 }

View File

@@ -63,11 +63,13 @@ export class WebScraperDataProvider {
     await Promise.all(
       batchUrls.map(async (url, index) => {
         const existingHTML = allHtmls ? allHtmls[i + index] : "";
+        console.log('convertUrlsToDocuments - scrapSingleUrl')
         const result = await scrapSingleUrl(
           url,
           this.pageOptions,
           existingHTML
         );
+        console.log('convertUrlsToDocuments - result ok')
         processedUrls++;
         if (inProgress) {
           inProgress({
@@ -98,6 +100,7 @@ export class WebScraperDataProvider {
         return [] as Document[];
       }
     }
+    console.log('returning results from convertUrlsToDocuments...')
     return results.filter((result) => result !== null) as Document[];
   }
@@ -106,7 +109,7 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
     this.validateInitialUrl();
+    console.log('2. here OK!')
     if (!useCaching) {
       return this.processDocumentsWithoutCache(inProgress);
     }
@@ -175,6 +178,7 @@ export class WebScraperDataProvider {
       allowBackwardCrawling: this.allowBackwardCrawling,
     });
+    console.log('3. here OK!')
     let links = await crawler.start(
       inProgress,
       this.pageOptions,
@@ -186,21 +190,28 @@ export class WebScraperDataProvider {
       this.maxCrawledDepth
     );
+    console.log("8 - Mapping URLs from links");
     let allLinks = links.map((e) => e.url);
+    console.log("9 - Mapping HTML content from links");
     const allHtmls = links.map((e) => e.html);
+    console.log("10 - Checking if only URLs should be returned");
     if (this.returnOnlyUrls) {
       return this.returnOnlyUrlsResponse(allLinks, inProgress);
     }
     let documents = [];
+    console.log("11 - Checking if crawler is in fast mode and HTML content is present");
     // check if fast mode is enabled and there is html inside the links
     if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
+      console.log("12 - Processing links with HTML content in fast mode");
       documents = await this.processLinks(allLinks, inProgress, allHtmls);
     } else {
+      console.log("13 - Processing links in normal mode");
       documents = await this.processLinks(allLinks, inProgress);
     }
+    console.log("14 - Caching and finalizing documents");
     return this.cacheAndFinalizeDocuments(documents, allLinks);
   }
@@ -259,14 +270,22 @@ export class WebScraperDataProvider {
     links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
+    console.log('processLinks - convertUrlsToDocuments...')
     let documents = await this.convertUrlsToDocuments(
       links,
       inProgress,
       allHtmls
     );
-    documents = await this.getSitemapData(this.urls[0], documents);
+    console.log('processLinks - convertUrlsToDocuments - done')
+    console.log('processLinks - getSitemapData...')
+    documents = await this.getSitemapData(this.urls[0], documents);
+    console.log('processLinks - getSitemapData - done')
+    console.log('processLinks - applyPathReplacements...')
     documents = this.applyPathReplacements(documents);
+    console.log('processLinks - applyPathReplacements - done')
     // documents = await this.applyImgAltText(documents);
     if (
@@ -275,6 +294,7 @@ export class WebScraperDataProvider {
     ) {
       documents = await generateCompletions(documents, this.extractorOptions);
     }
+    console.log('processLinks - returning...')
     return documents.concat(pdfDocuments).concat(docxDocuments);
   }
@@ -320,8 +340,11 @@ export class WebScraperDataProvider {
     documents: Document[],
     links: string[]
   ): Promise<Document[]> {
+    console.log('cacheAndFinalizeDocuments - 1')
     await this.setCachedDocuments(documents, links);
+    console.log('cacheAndFinalizeDocuments - 2')
     documents = this.removeChildLinks(documents);
+    console.log('cacheAndFinalizeDocuments - 3')
     return documents.splice(0, this.limit);
   }

View File

@@ -113,13 +113,25 @@ export async function scrapWithScrapingBee(
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
   try {
+    console.log("13. scrapWithScrapingBee - 1")
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
+    console.log("13. scrapWithScrapingBee - 2")
     const clientParams = await generateRequestParams(
       url,
       wait_browser,
       timeout,
     );
+    console.log({ url,
+      wait_browser,
+      timeout })
+    console.log({
+      ...clientParams,
+      params: {
+        ...clientParams.params,
+        'transparent_status_code': 'True'
+      }
+    })
+    console.log("13. scrapWithScrapingBee - 3")
     const response = await client.get({
       ...clientParams,
       params: {
@@ -127,7 +139,7 @@ export async function scrapWithScrapingBee(
         'transparent_status_code': 'True'
       }
     });
+    console.log("13. scrapWithScrapingBee - 4")
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
@@ -140,6 +152,7 @@ export async function scrapWithScrapingBee(
       } catch (decodeError) {
         console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
       }
+      console.log("13. scrapWithScrapingBee - 5 - returning ok")
       return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
     }
   } catch (error) {
@@ -396,8 +409,13 @@ export async function scrapSingleUrl(
       screenshot = customScrapedContent.screenshot;
     }
+    console.log(
+      'chegou aqui'
+    )
     //* TODO: add an optional to return markdown or structured/extracted content
     let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
+    console.log('cleanedHtml')
     return {
       text: await parseMarkdown(cleanedHtml),
@@ -432,7 +450,9 @@ export async function scrapSingleUrl(
         break;
       }
+      console.log('attemptScraping - 1')
       const attempt = await attemptScraping(urlToScrap, scraper);
+      console.log('attemptScraping - 2 - return ok')
       text = attempt.text ?? '';
       html = attempt.html ?? '';
       screenshot = attempt.screenshot ?? '';
@@ -451,6 +471,7 @@ export async function scrapSingleUrl(
         console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
       }
     }
+    console.log('ok... here we are...')
     if (!text) {
       throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
@@ -487,6 +508,7 @@ export async function scrapSingleUrl(
       };
     }
+    console.log('returning document...')
     return document;
   } catch (error) {
     console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);

View File

@@ -8,7 +8,7 @@ export async function getLinksFromSitemap(
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: 3000 });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +42,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl);
+    const response = await axios.get(sitemapUrl, { timeout: 3000 });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);

View File

@@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean {
   }
   try {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      url = 'https://' + url;
+    }
     const urlObj = new URL(url);
     const hostname = urlObj.hostname.toLowerCase();
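Note: this scheme check is needed because the WHATWG URL constructor rejects scheme-less input such as the test URL used in this commit. A minimal sketch of the behaviour (assuming Node's global URL; exact error text may vary by runtime):

// Without a scheme, the URL constructor throws:
new URL("new.abb.com/sustainability/foundation");                   // TypeError: Invalid URL
// Prepending a scheme first makes the input parseable:
new URL("https://new.abb.com/sustainability/foundation").hostname;  // "new.abb.com"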

View File

@@ -71,7 +71,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {

View File

@@ -4,7 +4,7 @@ export async function attemptScrapWithRequests(
   urlToScrap: string
 ): Promise<string | null> {
   try {
-    const response = await axios.get(urlToScrap);
+    const response = await axios.get(urlToScrap, { timeout: 15000 });
     if (!response.data) {
       console.log("Failed normal requests as well");

View File

@@ -14,6 +14,7 @@ if(process.env.ENV === 'production') {
 getWebScraperQueue().process(
   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
   async function (job, done) {
+    console.log('getWebScraperQueue - start')
     try {
       job.progress({
         current: 1,
@@ -22,11 +23,13 @@ getWebScraperQueue().process(
         current_url: "",
       });
       const start = Date.now();
+      console.log('getWebScraperQueue - startWebScraperPipeline')
       const { success, message, docs } = await startWebScraperPipeline({ job });
+      console.log('getWebScraperQueue - startWebScraperPipeline - done')
       const end = Date.now();
       const timeTakenInSeconds = (end - start) / 1000;
+      console.log('docs.length:', docs.length)
       const data = {
         success: success,
         result: {