Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 03:32:22 +08:00)
testing crawl with new.abb.com case
Adds many temporary console.logs for tracing the code execution, plus explicit timeouts on the axios requests made during a crawl.
This commit is contained in:
parent 3c7b7e7242
commit 21d29de819
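The diff below applies two recurring patterns: plain axios.get(url) calls gain an explicit timeout so a hanging host fails fast instead of stalling the crawl, and numbered console.log markers trace how far execution gets. A minimal sketch of that pattern, for reference only; the helper name, URL, and log labels are illustrative and not taken from the codebase:

import axios from "axios";

// Illustrative helper, not part of the diff: a traced GET with a hard timeout.
// The 3000 ms value mirrors the robots.txt and sitemap fetches below; other
// call sites in this commit use 6000 ms or 15000 ms.
async function fetchWithTrace(url: string): Promise<string> {
  console.log("1. fetchWithTrace - start", url);
  const response = await axios.get(url, { timeout: 3000 });
  console.log("2. fetchWithTrace - done, status:", response.status);
  return response.data;
}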
@@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json

 {
-"url":"https://docs.mendable.ai"
+"url":"new.abb.com/sustainability/foundation"
 }

@@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1

 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json

 {
-"url":"https://www.mendable.ai",
-"crawlerOptions": {
-"returnOnlyUrls": true
-}
+"url": "new.abb.com/sustainability/foundation"
 }

+## "reoveTags": [],
+# "mode": "crawl",
+# "crawlerOptions": {
+# "allowBackwardCrawling": false
+# },
+# "pageOptions": {
+# "onlyMainContent": false,
+# "includeHtml": false,
+# "parsePDF": true
+# }

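The requests above are the manual test case this commit exercises. For reference, a rough script equivalent of the crawl request; the endpoint and the fc- bearer token are placeholders copied from the .http file above, not a published client API:

import axios from "axios";

// Hypothetical test snippet: start a crawl against the locally running API.
async function startCrawl(): Promise<void> {
  const res = await axios.post(
    "http://localhost:3002/v0/crawl",
    { url: "new.abb.com/sustainability/foundation" },
    {
      headers: {
        Authorization: "Bearer fc-YOUR_KEY", // placeholder key
        "content-type": "application/json",
      },
    }
  );
  console.log(res.data);
}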
@@ -66,6 +66,7 @@ export async function crawlController(req: Request, res: Response) {
 parsePDF: true
 };

+console.log('1. here OK!')
 if (mode === "single_urls" && !url.includes(",")) {
 try {
 const a = new WebScraperDataProvider();
@@ -84,6 +85,7 @@ export async function crawlController(req: Request, res: Response) {
 current_url: progress.currentDocumentUrl,
 });
 });
+console.log('crawlController - return res.json...')
 return res.json({
 success: true,
 documents: docs,
@@ -1,5 +1,6 @@

 export function parseMarkdown(html: string) {
+console.log('parseMarkdown - start!')
 var TurndownService = require("turndown");
 var turndownPluginGfm = require('joplin-turndown-plugin-gfm')

@@ -50,6 +51,6 @@ export function parseMarkdown(html: string) {
 /\[Skip to Content\]\(#[^\)]*\)/gi,
 ""
 );
+console.log('parseMarkdown - return')
 return markdownContent;
 }
@@ -78,9 +78,11 @@ export async function runWebScraper({
 pageOptions: pageOptions,
 });
 }
+console.log('runWebScraper - getDocuments')
 const docs = (await provider.getDocuments(false, (progress: Progress) => {
 inProgress(progress);
 })) as Document[];
+console.log('runWebScraper - getDocuments - done - docs.length:', docs.length)

 if (docs.length === 0) {
 return {
@@ -129,24 +129,31 @@ export class WebCrawler {
 ): Promise<{ url: string, html: string }[]> {
 // Fetch and parse robots.txt
 try {
-const response = await axios.get(this.robotsTxtUrl);
+console.log('3.1 here OK')
+console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
+const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 });
+console.log('????', {response})
+console.log('3.2 here OK')
 this.robots = robotsParser(this.robotsTxtUrl, response.data);
 } catch (error) {
 console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);

 }

+console.log('4. here OK!')
 if(!crawlerOptions?.ignoreSitemap){
+console.log('4.1')
 const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+console.log('4.2')
 if (sitemapLinks.length > 0) {
+console.log('4.3')
 let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
+console.log('4.4')
 return filteredLinks.map(link => ({ url: link, html: "" }));
 }
 }

+console.log('5. here OK!')
 const urls = await this.crawlUrls(
 [this.initialUrl],
 pageOptions,
@@ -154,7 +161,7 @@ export class WebCrawler {
 inProgress
 );

+console.log('6. here OK!')
 if (
 urls.length === 0 &&
 this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -164,6 +171,7 @@ export class WebCrawler {

 // make sure to run include exclude here again
 const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
+console.log('7. here OK!')
 return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
 }

@@ -180,6 +188,7 @@ export class WebCrawler {
 }
 return;
 }
+console.log('crawlUrls - crawl')
 const newUrls = await this.crawl(task, pageOptions);
 // add the initial url if not already added
 // if (this.visited.size === 1) {
@@ -192,7 +201,7 @@ export class WebCrawler {
 // }
 // }

+console.log('---??---')
 newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));

 if (inProgress && newUrls.length > 0) {
@@ -210,12 +219,14 @@ export class WebCrawler {
 currentDocumentUrl: task,
 });
 }
+console.log('----???----')
 await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
 if (callback && typeof callback === "function") {
 callback();
 }
 }, concurrencyLimit);

+console.log('crawlUrls - queue.push')
 queue.push(
 urls.filter(
 (url) =>
@@ -225,7 +236,9 @@ export class WebCrawler {
 if (err) console.error(err);
 }
 );
+console.log('crawlUrls - queue.drain')
 await queue.drain();
+console.log('crawlUrls - return')
 return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
 }

@@ -253,16 +266,22 @@ export class WebCrawler {

 // If it is the first link, fetch with single url
 if (this.visited.size === 1) {
+console.log('crawl scrapSingleUrl...')
 const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+console.log('got a page! lets continue...')
 content = page.html ?? "";
 pageStatusCode = page.metadata?.pageStatusCode;
 pageError = page.metadata?.pageError || undefined;
 } else {
-const response = await axios.get(url);
+// console.log('crawl - else')
+const response = await axios.get(url, { timeout: 3000 });
+console.log('crawl - else - response ok')
 content = response.data ?? "";
 pageStatusCode = response.status;
 pageError = response.statusText != "OK" ? response.statusText : undefined;
 }

+console.log('crawl... keep going')
 const $ = load(content);
 let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

@@ -271,14 +290,17 @@ export class WebCrawler {
 links.push({ url, html: content, pageStatusCode, pageError });
 }

+console.log('crawl... keep going 2')
 $("a").each((_, element) => {
 const href = $(element).attr("href");
 if (href) {
+console.log('href:', href)
 let fullUrl = href;
 if (!href.startsWith("http")) {
 fullUrl = new URL(href, this.baseUrl).toString();
 }
 const urlObj = new URL(fullUrl);
+console.log('urlObj:', urlObj)
 const path = urlObj.pathname;


@@ -295,10 +317,13 @@ export class WebCrawler {
 }
 }
 });
+console.log('crawl... keep going 3')

 if (this.visited.size === 1) {
 return links;
 }

+console.log('returning crawl...')
 // Create a new list to return to avoid modifying the visited list
 return links.filter((link) => !this.visited.has(link.url));
 } catch (error) {
@@ -385,6 +410,7 @@ export class WebCrawler {

 //
 private async tryFetchSitemapLinks(url: string): Promise<string[]> {
+console.log("4.1.1 - Normalizing URL");
 const normalizeUrl = (url: string) => {
 url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
 if (url.endsWith("/")) {
@@ -393,46 +419,48 @@ export class WebCrawler {
 return url;
 };

+console.log("4.1.2 - Constructing sitemap URL");
 const sitemapUrl = url.endsWith("/sitemap.xml")
 ? url
 : `${url}/sitemap.xml`;

 let sitemapLinks: string[] = [];

+console.log("4.1.3 - Fetching sitemap from constructed URL");
 try {
-const response = await axios.get(sitemapUrl);
+const response = await axios.get(sitemapUrl, { timeout: 3000 });
 if (response.status === 200) {
+console.log("4.1.4 - Extracting links from sitemap");
 sitemapLinks = await getLinksFromSitemap(sitemapUrl);
 }
 } catch (error) {
-// Error handling for failed sitemap fetch
-// console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
 }

 if (sitemapLinks.length === 0) {
-// If the first one doesn't work, try the base URL
+console.log("4.1.5 - Trying base URL sitemap as fallback");
 const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
 try {
-const response = await axios.get(baseUrlSitemap);
+const response = await axios.get(baseUrlSitemap, { timeout: 3000 });
 if (response.status === 200) {
+console.log("4.1.6 - Extracting links from base URL sitemap");
 sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
 }
 } catch (error) {
-// Error handling for failed base URL sitemap fetch
-// console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
 }
 }

-// Normalize and check if the URL is present in any of the sitemaps
+console.log("4.1.7 - Normalizing sitemap links");
 const normalizedUrl = normalizeUrl(url);
 const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));

-// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
+console.log("4.1.8 - Checking if normalized URL is already included");
 if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-// do not push the normalized url
+console.log("4.1.9 - Adding initial URL to sitemap links");
 sitemapLinks.push(url);
 }
+console.log("4.1.10 - Returning sitemap links");
 return sitemapLinks;
 }
 }
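The tryFetchSitemapLinks hunk above compares sitemap entries against the initial URL after running both through normalizeUrl, which strips the scheme, a leading www., and a trailing slash. A small standalone sketch of that comparison with assumed inputs; the trailing-slash branch body is an assumption, since it falls outside the hunk's context lines:

// Mirror of the normalizeUrl helper shown in the hunk (trailing-slash body assumed).
const normalizeUrl = (url: string): string => {
  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }
  return url;
};

console.log(normalizeUrl("https://www.new.abb.com/sustainability/foundation/"));
// -> "new.abb.com/sustainability/foundation"
console.log(normalizeUrl("new.abb.com/sitemap.xml"));
// -> "new.abb.com/sitemap.xml"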
@@ -63,11 +63,13 @@ export class WebScraperDataProvider {
 await Promise.all(
 batchUrls.map(async (url, index) => {
 const existingHTML = allHtmls ? allHtmls[i + index] : "";
+console.log('convertUrlsToDocuments - scrapSingleUrl')
 const result = await scrapSingleUrl(
 url,
 this.pageOptions,
 existingHTML
 );
+console.log('convertUrlsToDocuments - result ok')
 processedUrls++;
 if (inProgress) {
 inProgress({
@@ -98,6 +100,7 @@ export class WebScraperDataProvider {
 return [] as Document[];
 }
 }
+console.log('returning results from convertUrlsToDocuments...')
 return results.filter((result) => result !== null) as Document[];
 }

@@ -106,7 +109,7 @@ export class WebScraperDataProvider {
 inProgress?: (progress: Progress) => void
 ): Promise<Document[]> {
 this.validateInitialUrl();
+console.log('2. here OK!')
 if (!useCaching) {
 return this.processDocumentsWithoutCache(inProgress);
 }
@@ -175,6 +178,7 @@ export class WebScraperDataProvider {
 allowBackwardCrawling: this.allowBackwardCrawling,
 });

+console.log('3. here OK!')
 let links = await crawler.start(
 inProgress,
 this.pageOptions,
@@ -186,21 +190,28 @@ export class WebScraperDataProvider {
 this.maxCrawledDepth
 );

+console.log("8 - Mapping URLs from links");
 let allLinks = links.map((e) => e.url);
+console.log("9 - Mapping HTML content from links");
 const allHtmls = links.map((e) => e.html);

+console.log("10 - Checking if only URLs should be returned");
 if (this.returnOnlyUrls) {
 return this.returnOnlyUrlsResponse(allLinks, inProgress);
 }

 let documents = [];
+console.log("11 - Checking if crawler is in fast mode and HTML content is present");
 // check if fast mode is enabled and there is html inside the links
 if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
+console.log("12 - Processing links with HTML content in fast mode");
 documents = await this.processLinks(allLinks, inProgress, allHtmls);
 } else {
+console.log("13 - Processing links in normal mode");
 documents = await this.processLinks(allLinks, inProgress);
 }

+console.log("14 - Caching and finalizing documents");
 return this.cacheAndFinalizeDocuments(documents, allLinks);
 }

@@ -259,14 +270,22 @@ export class WebScraperDataProvider {

 links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));

+console.log('processLinks - convertUrlsToDocuments...')
 let documents = await this.convertUrlsToDocuments(
 links,
 inProgress,
 allHtmls
 );
-documents = await this.getSitemapData(this.urls[0], documents);
+console.log('processLinks - convertUrlsToDocuments - done')

+console.log('processLinks - getSitemapData...')
+documents = await this.getSitemapData(this.urls[0], documents);
+console.log('processLinks - getSitemapData - done')

+console.log('processLinks - applyPathReplacements...')
 documents = this.applyPathReplacements(documents);
+console.log('processLinks - applyPathReplacements - done')

 // documents = await this.applyImgAltText(documents);

 if (
@@ -275,6 +294,7 @@ export class WebScraperDataProvider {
 ) {
 documents = await generateCompletions(documents, this.extractorOptions);
 }
+console.log('processLinks - returning...')
 return documents.concat(pdfDocuments).concat(docxDocuments);
 }

@@ -320,8 +340,11 @@ export class WebScraperDataProvider {
 documents: Document[],
 links: string[]
 ): Promise<Document[]> {
+console.log('cacheAndFinalizeDocuments - 1')
 await this.setCachedDocuments(documents, links);
+console.log('cacheAndFinalizeDocuments - 2')
 documents = this.removeChildLinks(documents);
+console.log('cacheAndFinalizeDocuments - 3')
 return documents.splice(0, this.limit);
 }

@@ -113,13 +113,25 @@ export async function scrapWithScrapingBee(
 pageOptions: { parsePDF?: boolean } = { parsePDF: true }
 ): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
 try {
+console.log("13. scrapWithScrapingBee - 1")
 const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
+console.log("13. scrapWithScrapingBee - 2")
 const clientParams = await generateRequestParams(
 url,
 wait_browser,
 timeout,
 );
+console.log({ url,
+wait_browser,
+timeout })
+console.log({
+...clientParams,
+params: {
+...clientParams.params,
+'transparent_status_code': 'True'
+}
+})
+console.log("13. scrapWithScrapingBee - 3")
 const response = await client.get({
 ...clientParams,
 params: {
@@ -127,7 +139,7 @@ export async function scrapWithScrapingBee(
 'transparent_status_code': 'True'
 }
 });
+console.log("13. scrapWithScrapingBee - 4")
 const contentType = response.headers["content-type"];
 if (contentType && contentType.includes("application/pdf")) {
 return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
@@ -140,6 +152,7 @@ export async function scrapWithScrapingBee(
 } catch (decodeError) {
 console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
 }
+console.log("13. scrapWithScrapingBee - 5 - returning ok")
 return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
 }
 } catch (error) {
@@ -396,8 +409,13 @@ export async function scrapSingleUrl(
 screenshot = customScrapedContent.screenshot;
 }

+console.log(
+'chegou aqui'
+)
+
 //* TODO: add an optional to return markdown or structured/extracted content
 let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
+console.log('cleanedHtml')

 return {
 text: await parseMarkdown(cleanedHtml),
@@ -432,7 +450,9 @@ export async function scrapSingleUrl(
 break;
 }

+console.log('attemptScraping - 1')
 const attempt = await attemptScraping(urlToScrap, scraper);
+console.log('attemptScraping - 2 - return ok')
 text = attempt.text ?? '';
 html = attempt.html ?? '';
 screenshot = attempt.screenshot ?? '';
@@ -451,6 +471,7 @@ export async function scrapSingleUrl(
 console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
 }
 }
+console.log('ok... here we are...')

 if (!text) {
 throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
@@ -487,6 +508,7 @@ export async function scrapSingleUrl(
 };
 }

+console.log('returning document...')
 return document;
 } catch (error) {
 console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
@@ -8,7 +8,7 @@ export async function getLinksFromSitemap(
 try {
 let content: string;
 try {
-const response = await axios.get(sitemapUrl);
+const response = await axios.get(sitemapUrl, { timeout: 3000 });
 content = response.data;
 } catch (error) {
 console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +42,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
 const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
 try {
-const response = await axios.get(sitemapUrl);
+const response = await axios.get(sitemapUrl, { timeout: 3000 });
 if (response.status === 200) {
 const xml = response.data;
 const parsedXml = await parseStringPromise(xml);
@@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean {
 }

 try {
+if (!url.startsWith('http://') && !url.startsWith('https://')) {
+url = 'https://' + url;
+}
+
 const urlObj = new URL(url);
 const hostname = urlObj.hostname.toLowerCase();

@@ -71,7 +71,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro

 while (attempt < maxAttempts && !resultAvailable) {
 try {
-resultResponse = await axios.get(resultUrl, { headers });
+resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 });
 if (resultResponse.status === 200) {
 resultAvailable = true; // Exit condition met
 } else {
@@ -4,7 +4,7 @@ export async function attemptScrapWithRequests(
 urlToScrap: string
 ): Promise<string | null> {
 try {
-const response = await axios.get(urlToScrap);
+const response = await axios.get(urlToScrap, { timeout: 15000 });

 if (!response.data) {
 console.log("Failed normal requests as well");
@@ -14,6 +14,7 @@ if(process.env.ENV === 'production') {
 getWebScraperQueue().process(
 Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
 async function (job, done) {
+console.log('getWebScraperQueue - start')
 try {
 job.progress({
 current: 1,
@@ -22,11 +23,13 @@ getWebScraperQueue().process(
 current_url: "",
 });
 const start = Date.now();
+console.log('getWebScraperQueue - startWebScraperPipeline')
 const { success, message, docs } = await startWebScraperPipeline({ job });
+console.log('getWebScraperQueue - startWebScraperPipeline - done')
 const end = Date.now();
 const timeTakenInSeconds = (end - start) / 1000;

+console.log('docs.length:', docs.length)
 const data = {
 success: success,
 result: {