Mirror of https://github.com/mendableai/firecrawl.git (synced 2024-11-16 03:32:22 +08:00)

Commit: 2d78c20d68
Parent: 4d0acc9722
Commit message: Nick:

@@ -12,9 +12,11 @@ import {
   checkAndUpdateURLForMap,
   isSameDomain,
   isSameSubdomain,
+  removeDuplicateUrls,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
 import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";
 
 configDotenv();
 
@@ -22,12 +24,14 @@ export async function mapController(
   req: RequestWithAuth<{}, MapResponse, MapRequest>,
   res: Response<MapResponse>
 ) {
+  const startTime = new Date().getTime();
+
   req.body = mapRequestSchema.parse(req.body);
 
+  const limit = req.body.limit;
   const id = uuidv4();
   let links: string[] = [req.body.url];
 
-
   const sc: StoredCrawl = {
     originUrl: req.body.url,
     crawlerOptions: legacyCrawlerOptions(req.body),
@@ -38,10 +42,7 @@ export async function mapController(
 
   const crawler = crawlToCrawler(id, sc);
 
-  const sitemap =
-    req.body.ignoreSitemap
-      ? null
-      : await crawler.tryGetSitemap();
+  const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
 
   if (sitemap !== null) {
     sitemap.map((x) => {
@@ -50,19 +51,24 @@ export async function mapController(
   }
 
   let urlWithoutWww = req.body.url.replace("www.", "");
-
   let mapUrl = req.body.search
     ? `"${req.body.search}" site:${urlWithoutWww}`
    : `site:${req.body.url}`;
+  // www. seems to exclude subdomains in some cases
   const mapResults = await fireEngineMap(mapUrl, {
-    numResults: 50,
+    // limit to 50 results (beta)
+    numResults: Math.min(limit, 50),
   });
 
   if (mapResults.length > 0) {
     if (req.body.search) {
       // Ensure all map results are first, maintaining their order
-      links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
+      links = [
+        mapResults[0].url,
+        ...mapResults.slice(1).map((x) => x.url),
+        ...links,
+      ];
     } else {
       mapResults.map((x) => {
         links.push(x.url);
@@ -72,8 +78,6 @@ export async function mapController(
 
   links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
-
-
-
+  // allows for subdomains to be included
   links = links.filter((x) => isSameDomain(x, req.body.url));
 
 
@@ -83,12 +87,32 @@ export async function mapController(
   }
 
   // remove duplicates that could be due to http/https or www
-  links = [...new Set(links)];
+  links = removeDuplicateUrls(links);
 
   await billTeam(req.auth.team_id, 1);
 
+  const endTime = new Date().getTime();
+  const timeTakenInSeconds = (endTime - startTime) / 1000;
+
+  logJob({
+    job_id: id,
+    success: true,
+    message: "Map completed",
+    num_docs: 1,
+    docs: links,
+    time_taken: timeTakenInSeconds,
+    team_id: req.auth.team_id,
+    mode: "map",
+    url: req.body.url,
+    crawlerOptions: {},
+    pageOptions: {},
+    origin: req.body.origin,
+    extractor_options: { mode: "markdown" },
+    num_tokens: 0,
+  });
+
   return res.status(200).json({
     success: true,
-    links,
+    links: links.slice(0, limit),
   });
 }
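
Taken together, these controller changes read the caller's limit from the parsed request, cap the beta Fire Engine search at 50 results, time the whole operation for logJob, and trim the final link list to the limit before responding. Below is a minimal standalone TypeScript sketch of the capping behavior only; the helper names are illustrative and not part of the diff.

// Sketch only: mirrors the Math.min(limit, 50) and links.slice(0, limit) logic above.
function searchResultCount(limit: number): number {
  // never ask the beta map backend for more than 50 results
  return Math.min(limit, 50);
}

function applyLimit(links: string[], limit: number): string[] {
  // trim the assembled link list to the caller's limit before responding
  return links.slice(0, limit);
}

// Example: a request with limit 10 issues a 10-result search and returns at most 10 links.
const exampleLinks = ["https://example.com", "https://example.com/docs", "https://example.com/blog"];
console.log(searchResultCount(10));        // 10
console.log(applyLimit(exampleLinks, 2));  // first two links only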

@@ -121,6 +121,7 @@ export const mapRequestSchema = crawlerOptions.extend({
   includeSubdomains: z.boolean().default(true),
   search: z.string().optional(),
   ignoreSitemap: z.boolean().default(false),
+  limit: z.number().min(1).max(50).default(5000),
 }).strict(strictMessage);
 
 // export type MapRequest = {
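
For reference, a self-contained sketch of how these map options behave at parse time. The schema below copies only the fields shown in this hunk; the real mapRequestSchema also extends crawlerOptions and is marked .strict(strictMessage). Note that, as written, the default of 5000 is larger than max(50), so a request that omits limit would likely fail validation, since zod re-checks the filled-in default against the inner constraints.

import { z } from "zod";

// Illustrative subset of the options above, not the full mapRequestSchema.
const mapOptionsSketch = z.object({
  url: z.string().url(),
  includeSubdomains: z.boolean().default(true),
  search: z.string().optional(),
  ignoreSitemap: z.boolean().default(false),
  limit: z.number().min(1).max(50).default(5000),
});

// Omitted booleans get their defaults; an explicit limit must fall in 1..50.
const parsed = mapOptionsSketch.parse({
  url: "https://example.com",
  search: "docs",
  limit: 10,
});
console.log(parsed.includeSubdomains, parsed.ignoreSitemap, parsed.limit); // true false 10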

@@ -1,4 +1,4 @@
-import { isSameDomain } from "./validateUrl";
+import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
 import { isSameSubdomain } from "./validateUrl";
 
 describe("isSameDomain", () => {
@@ -85,4 +85,75 @@ describe("isSameSubdomain", () => {
     const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
     expect(result).toBe(false);
   });
 });
 });
 
+describe("removeDuplicateUrls", () => {
+  it("should remove duplicate URLs with different protocols", () => {
+    const urls = [
+      "http://example.com",
+      "https://example.com",
+      "http://www.example.com",
+      "https://www.example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should keep URLs with different paths", () => {
+    const urls = [
+      "https://example.com/page1",
+      "https://example.com/page2",
+      "https://example.com/page1?param=1",
+      "https://example.com/page1#section1"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual([
+      "https://example.com/page1",
+      "https://example.com/page2",
+      "https://example.com/page1?param=1",
+      "https://example.com/page1#section1"
+    ]);
+  });
+
+  it("should prefer https over http", () => {
+    const urls = [
+      "http://example.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should prefer non-www over www", () => {
+    const urls = [
+      "https://www.example.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should handle empty input", () => {
+    const urls: string[] = [];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual([]);
+  });
+
+  it("should handle URLs with different cases", () => {
+    const urls = [
+      "https://EXAMPLE.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://EXAMPLE.com"]);
+  });
+
+  it("should handle URLs with trailing slashes", () => {
+    const urls = [
+      "https://example.com",
+      "https://example.com/"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+});

@@ -120,3 +120,32 @@ export const checkAndUpdateURLForMap = (url: string) => {
 
 
 
+
+export function removeDuplicateUrls(urls: string[]): string[] {
+  const urlMap = new Map<string, string>();
+
+  for (const url of urls) {
+    const parsedUrl = new URL(url);
+    const protocol = parsedUrl.protocol;
+    const hostname = parsedUrl.hostname.replace(/^www\./, '');
+    const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
+
+    const key = `${hostname}${path}`;
+
+    if (!urlMap.has(key)) {
+      urlMap.set(key, url);
+    } else {
+      const existingUrl = new URL(urlMap.get(key)!);
+      const existingProtocol = existingUrl.protocol;
+
+      if (protocol === 'https:' && existingProtocol === 'http:') {
+        urlMap.set(key, url);
+      } else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
+        urlMap.set(key, url);
+      }
+    }
+  }
+
+  return [...new Set(Array.from(urlMap.values()))];
+}
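
A quick usage sketch of the new helper, matching the behavior exercised by the tests above: URLs are keyed by hostname (with any leading www. stripped) plus path, https wins over http, and the bare host wins over its www. variant.

// Assuming the same relative import path the controller uses in this diff.
import { removeDuplicateUrls } from "../../lib/validateUrl";

const deduped = removeDuplicateUrls([
  "http://example.com",
  "https://example.com",
  "http://www.example.com",
  "https://www.example.com",
  "https://example.com/page1",
]);

// One entry survives per hostname+path key, preferring https and the non-www host:
// ["https://example.com", "https://example.com/page1"]
console.log(deduped);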

@@ -24,6 +24,7 @@ export async function fireEngineMap(q: string, options: {
   });
 
   if (!process.env.FIRE_ENGINE_BETA_URL) {
+    console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
     return [];
   }
 
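
As the controller shows, callers pass the capped numResults and treat an empty result array as "no map results"; when FIRE_ENGINE_BETA_URL is unset, fireEngineMap warns and returns that same empty array. A hedged sketch of the call pattern follows; the query string and limit mirror the controller above, and the handling of the empty case is illustrative only.

import { fireEngineMap } from "../../search/fireEngine";

const limit = 20;
// Mirror the controller: never request more than 50 results from the beta backend.
const mapResults = await fireEngineMap("site:example.com", {
  numResults: Math.min(limit, 50),
});

// An empty array can mean either "no results" or "FIRE_ENGINE_BETA_URL not configured".
if (mapResults.length === 0) {
  console.log("No fire engine map results (or beta URL not set).");
}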