mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
fix(WebCrawler): filter out file URLs when taking URLs from sitemap
This commit is contained in:
parent
95c6c63b85
commit
f0e95ce399
|
@@ -383,7 +383,7 @@ export class WebCrawler {
|
|||
return linkDomain === baseDomain;
|
||||
}
|
||||
|
||||
private isFile(url: string): boolean {
|
||||
public isFile(url: string): boolean {
|
||||
const fileExtensions = [
|
||||
".png",
|
||||
".jpg",
|
||||
|
|
|
@@ -2,6 +2,7 @@ import axios from "axios";
|
|||
import { axiosTimeout } from "../../lib/timeout";
|
||||
import { parseStringPromise } from "xml2js";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
import { WebCrawler } from "./crawler";
|
||||
|
||||
export async function getLinksFromSitemap(
|
||||
{
|
||||
|
@@ -41,7 +42,7 @@ export async function getLinksFromSitemap(
|
|||
}
|
||||
} else if (root && root.url) {
|
||||
for (const url of root.url) {
|
||||
if (url.loc && url.loc.length > 0) {
|
||||
if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) {
|
||||
allUrls.push(url.loc[0]);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user