mirror of https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00

Nick:

parent 4d0acc9722 · commit 2d78c20d68
@@ -12,9 +12,11 @@ import {
   checkAndUpdateURLForMap,
   isSameDomain,
   isSameSubdomain,
+  removeDuplicateUrls,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
 import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";
 
 configDotenv();
 
@@ -22,12 +24,14 @@ export async function mapController(
   req: RequestWithAuth<{}, MapResponse, MapRequest>,
   res: Response<MapResponse>
 ) {
+  const startTime = new Date().getTime();
 
   req.body = mapRequestSchema.parse(req.body);
 
+  const limit = req.body.limit;
   const id = uuidv4();
   let links: string[] = [req.body.url];
 
   const sc: StoredCrawl = {
     originUrl: req.body.url,
     crawlerOptions: legacyCrawlerOptions(req.body),
@@ -38,10 +42,7 @@ export async function mapController(
 
   const crawler = crawlToCrawler(id, sc);
 
-  const sitemap =
-    req.body.ignoreSitemap
-      ? null
-      : await crawler.tryGetSitemap();
+  const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
 
   if (sitemap !== null) {
     sitemap.map((x) => {
@@ -50,19 +51,24 @@ export async function mapController(
   }
 
   let urlWithoutWww = req.body.url.replace("www.", "");
 
   let mapUrl = req.body.search
     ? `"${req.body.search}" site:${urlWithoutWww}`
     : `site:${req.body.url}`;
   // www. seems to exclude subdomains in some cases
   const mapResults = await fireEngineMap(mapUrl, {
-    numResults: 50,
+    // limit to 50 results (beta)
+    numResults: Math.min(limit, 50),
   });
 
   if (mapResults.length > 0) {
     if (req.body.search) {
       // Ensure all map results are first, maintaining their order
-      links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
+      links = [
+        mapResults[0].url,
+        ...mapResults.slice(1).map((x) => x.url),
+        ...links,
+      ];
     } else {
       mapResults.map((x) => {
         links.push(x.url);
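A note on this hunk: the beta search backend is now asked for at most 50 results no matter what `limit` the caller sends, while the response is separately sliced to `limit` further down. A minimal sketch of the arithmetic, using the schema default declared later in this commit:

const limit = 5000;       // schema default (see the mapRequestSchema hunk below)
Math.min(limit, 50);      // -> 50: the fire-engine query stays capped at 50
Math.min(10, 50);         // -> 10: a caller passing limit: 10 gets at most 10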
@@ -72,8 +78,6 @@ export async function mapController(
 
   links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
 
-
-
   // allows for subdomains to be included
   links = links.filter((x) => isSameDomain(x, req.body.url));
 
@@ -83,12 +87,32 @@ export async function mapController(
   }
 
   // remove duplicates that could be due to http/https or www
-  links = [...new Set(links)];
+  links = removeDuplicateUrls(links);
 
   await billTeam(req.auth.team_id, 1);
 
+  const endTime = new Date().getTime();
+  const timeTakenInSeconds = (endTime - startTime) / 1000;
+
+  logJob({
+    job_id: id,
+    success: true,
+    message: "Map completed",
+    num_docs: 1,
+    docs: links,
+    time_taken: timeTakenInSeconds,
+    team_id: req.auth.team_id,
+    mode: "map",
+    url: req.body.url,
+    crawlerOptions: {},
+    pageOptions: {},
+    origin: req.body.origin,
+    extractor_options: { mode: "markdown" },
+    num_tokens: 0,
+  });
+
   return res.status(200).json({
     success: true,
-    links,
+    links: links.slice(0, limit),
   });
 }
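For context, a minimal sketch of a client call exercising the new limit behavior. The route path, host, and auth header are assumptions not shown in this diff; the body fields come from mapRequestSchema below.

// Assumed endpoint shape: POST /v1/map behind bearer auth (not part of this diff).
const res = await fetch("https://api.firecrawl.dev/v1/map", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR_API_KEY", // placeholder key
  },
  body: JSON.stringify({
    url: "https://example.com",
    search: "docs", // optional: matching results are ordered first
    limit: 25,      // new: caps both the fire-engine query and the returned links
  }),
});
const { success, links } = (await res.json()) as {
  success: boolean;
  links: string[];
};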
@@ -121,6 +121,7 @@ export const mapRequestSchema = crawlerOptions.extend({
   includeSubdomains: z.boolean().default(true),
   search: z.string().optional(),
   ignoreSitemap: z.boolean().default(false),
+  limit: z.number().min(1).max(50).default(5000),
 }).strict(strictMessage);
 
 // export type MapRequest = {
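One caveat worth flagging on this hunk: under zod 3.x semantics, where the default value is itself validated by the inner number schema, `.max(50).default(5000)` rejects any request that omits `limit`, since 5000 fails the max check. Presumably one of the two bounds (likely `.max(5000)`) was intended. A minimal repro, assuming zod 3.x:

import { z } from "zod";

const limitField = z.number().min(1).max(50).default(5000);

limitField.parse(25);        // ok -> 25
limitField.parse(undefined); // throws: "Number must be less than or equal to 50"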
@@ -1,4 +1,4 @@
-import { isSameDomain } from "./validateUrl";
+import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
 import { isSameSubdomain } from "./validateUrl";
 
 describe("isSameDomain", () => {
@@ -85,4 +85,75 @@ describe("isSameSubdomain", () => {
     const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
     expect(result).toBe(false);
   });
 });
+
+describe("removeDuplicateUrls", () => {
+  it("should remove duplicate URLs with different protocols", () => {
+    const urls = [
+      "http://example.com",
+      "https://example.com",
+      "http://www.example.com",
+      "https://www.example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should keep URLs with different paths", () => {
+    const urls = [
+      "https://example.com/page1",
+      "https://example.com/page2",
+      "https://example.com/page1?param=1",
+      "https://example.com/page1#section1"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual([
+      "https://example.com/page1",
+      "https://example.com/page2",
+      "https://example.com/page1?param=1",
+      "https://example.com/page1#section1"
+    ]);
+  });
+
+  it("should prefer https over http", () => {
+    const urls = [
+      "http://example.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should prefer non-www over www", () => {
+    const urls = [
+      "https://www.example.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+
+  it("should handle empty input", () => {
+    const urls: string[] = [];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual([]);
+  });
+
+  it("should handle URLs with different cases", () => {
+    const urls = [
+      "https://EXAMPLE.com",
+      "https://example.com"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://EXAMPLE.com"]);
+  });
+
+  it("should handle URLs with trailing slashes", () => {
+    const urls = [
+      "https://example.com",
+      "https://example.com/"
+    ];
+    const result = removeDuplicateUrls(urls);
+    expect(result).toEqual(["https://example.com"]);
+  });
+});
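The expected arrays in these tests are deterministic because the implementation below keys entries in a Map: replacing the value for an existing key keeps that key's original insertion position. A quick illustration of that Map behavior:

const m = new Map<string, string>();
m.set("example.com/", "http://example.com");
m.set("example.com/", "https://example.com"); // value replaced, position kept
console.log([...m.values()]);                 // ["https://example.com"]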
@@ -120,3 +120,32 @@ export const checkAndUpdateURLForMap = (url: string) => {
 
 
 
+
+export function removeDuplicateUrls(urls: string[]): string[] {
+  const urlMap = new Map<string, string>();
+
+  for (const url of urls) {
+    const parsedUrl = new URL(url);
+    const protocol = parsedUrl.protocol;
+    const hostname = parsedUrl.hostname.replace(/^www\./, '');
+    const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
+
+    const key = `${hostname}${path}`;
+
+    if (!urlMap.has(key)) {
+      urlMap.set(key, url);
+    } else {
+      const existingUrl = new URL(urlMap.get(key)!);
+      const existingProtocol = existingUrl.protocol;
+
+      if (protocol === 'https:' && existingProtocol === 'http:') {
+        urlMap.set(key, url);
+      } else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
+        urlMap.set(key, url);
+      }
+    }
+  }
+
+  return [...new Set(Array.from(urlMap.values()))];
+}
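How the dedup key works here: the protocol is dropped and a leading "www." is stripped, so http/https and www/non-www variants of the same path collide on one key, with https preferred over http, and non-www preferred over www when protocols match. Note that `new URL(url)` throws on relative or malformed strings, so inputs are assumed to be absolute URLs (the controller normalizes them with checkAndUpdateURLForMap first). An illustrative call:

// All three collapse to the key "example.com/pricing":
removeDuplicateUrls([
  "http://example.com/pricing",
  "https://example.com/pricing",     // replaces the stored http entry
  "https://www.example.com/pricing", // loses to the stored non-www https entry
]);
// => ["https://example.com/pricing"]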
@@ -24,6 +24,7 @@ export async function fireEngineMap(q: string, options: {
   });
 
   if (!process.env.FIRE_ENGINE_BETA_URL) {
+    console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
     return [];
   }
 