This commit is contained in:
Nicolas 2024-08-26 16:56:27 -03:00
parent 4d0acc9722
commit 2d78c20d68
5 changed files with 140 additions and 14 deletions

View File

@ -12,9 +12,11 @@ import {
checkAndUpdateURLForMap,
isSameDomain,
isSameSubdomain,
removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
configDotenv();
@ -22,12 +24,14 @@ export async function mapController(
req: RequestWithAuth<{}, MapResponse, MapRequest>,
res: Response<MapResponse>
) {
const startTime = new Date().getTime();
req.body = mapRequestSchema.parse(req.body);
const limit = req.body.limit;
const id = uuidv4();
let links: string[] = [req.body.url];
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions: legacyCrawlerOptions(req.body),
@ -38,10 +42,7 @@ export async function mapController(
const crawler = crawlToCrawler(id, sc);
const sitemap =
req.body.ignoreSitemap
? null
: await crawler.tryGetSitemap();
const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();
if (sitemap !== null) {
sitemap.map((x) => {
@ -56,13 +57,18 @@ export async function mapController(
: `site:${req.body.url}`;
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
numResults: 50,
// limit to 50 results (beta)
numResults: Math.min(limit, 50),
});
if (mapResults.length > 0) {
if (req.body.search) {
// Ensure all map results are first, maintaining their order
links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
links = [
mapResults[0].url,
...mapResults.slice(1).map((x) => x.url),
...links,
];
} else {
mapResults.map((x) => {
links.push(x.url);
@ -72,8 +78,6 @@ export async function mapController(
links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));
@ -83,12 +87,32 @@ export async function mapController(
}
// remove duplicates that could be due to http/https or www
links = [...new Set(links)];
links = removeDuplicateUrls(links);
await billTeam(req.auth.team_id, 1);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({
job_id: id,
success: true,
message: "Map completed",
num_docs: 1,
docs: links,
time_taken: timeTakenInSeconds,
team_id: req.auth.team_id,
mode: "map",
url: req.body.url,
crawlerOptions: {},
pageOptions: {},
origin: req.body.origin,
extractor_options: { mode: "markdown" },
num_tokens: 0,
});
return res.status(200).json({
success: true,
links,
links: links.slice(0, limit),
});
}

View File

@ -121,6 +121,7 @@ export const mapRequestSchema = crawlerOptions.extend({
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
ignoreSitemap: z.boolean().default(false),
limit: z.number().min(1).max(50).default(5000),
}).strict(strictMessage);
// export type MapRequest = {

View File

@ -1,4 +1,4 @@
import { isSameDomain } from "./validateUrl";
import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
import { isSameSubdomain } from "./validateUrl";
describe("isSameDomain", () => {
@ -86,3 +86,74 @@ describe("isSameSubdomain", () => {
expect(result).toBe(false);
});
});
// Table-driven suite for removeDuplicateUrls: every scenario pairs a test
// title with the URLs handed to the function and the exact list expected back.
describe("removeDuplicateUrls", () => {
  const scenarios: Array<{ title: string; input: string[]; expected: string[] }> = [
    {
      title: "should remove duplicate URLs with different protocols",
      input: [
        "http://example.com",
        "https://example.com",
        "http://www.example.com",
        "https://www.example.com",
      ],
      expected: ["https://example.com"],
    },
    {
      // Path, query string and fragment all make a URL distinct.
      title: "should keep URLs with different paths",
      input: [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page1?param=1",
        "https://example.com/page1#section1",
      ],
      expected: [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page1?param=1",
        "https://example.com/page1#section1",
      ],
    },
    {
      title: "should prefer https over http",
      input: ["http://example.com", "https://example.com"],
      expected: ["https://example.com"],
    },
    {
      title: "should prefer non-www over www",
      input: ["https://www.example.com", "https://example.com"],
      expected: ["https://example.com"],
    },
    {
      title: "should handle empty input",
      input: [],
      expected: [],
    },
    {
      // The first spelling encountered is the one that is kept.
      title: "should handle URLs with different cases",
      input: ["https://EXAMPLE.com", "https://example.com"],
      expected: ["https://EXAMPLE.com"],
    },
    {
      title: "should handle URLs with trailing slashes",
      input: ["https://example.com", "https://example.com/"],
      expected: ["https://example.com"],
    },
  ];

  for (const { title, input, expected } of scenarios) {
    it(title, () => {
      expect(removeDuplicateUrls(input)).toEqual(expected);
    });
  }
});

View File

@ -120,3 +120,32 @@ export const checkAndUpdateURLForMap = (url: string) => {
/**
 * Collapses URLs that differ only by scheme (`http:` vs `https:`) or by a
 * leading `www.` host label into a single entry.
 *
 * Two URLs are considered duplicates when their host (including any
 * non-default port, with a leading `www.` removed), path, query string and
 * fragment are all identical. Among duplicates:
 *  - an `https:` URL replaces a previously kept `http:` one;
 *  - with equal schemes, a non-`www.` URL replaces a previously kept `www.` one;
 *  - otherwise the first URL seen is kept.
 *
 * The kept entries are returned as the original input strings, ordered by the
 * first appearance of each duplicate group.
 *
 * @param urls - Absolute URLs to de-duplicate.
 * @returns The de-duplicated list.
 * @throws TypeError if any entry cannot be parsed by the `URL` constructor.
 */
export function removeDuplicateUrls(urls: string[]): string[] {
  const leadingWww = /^www\./;
  // For each duplicate group, remember the winning input string together with
  // the facts needed to compare it against later candidates, so the stored
  // URL never has to be parsed a second time.
  const kept = new Map<string, { url: string; protocol: string; hasWww: boolean }>();

  for (const url of urls) {
    const parsed = new URL(url);
    const candidate = {
      url,
      protocol: parsed.protocol,
      hasWww: parsed.hostname.startsWith('www.'),
    };

    // `host` (unlike `hostname`) retains an explicit non-default port, so
    // `example.com:8080` and `example.com` are not mistaken for duplicates.
    const key = `${parsed.host.replace(leadingWww, '')}${parsed.pathname}${parsed.search}${parsed.hash}`;

    const current = kept.get(key);
    if (current === undefined) {
      kept.set(key, candidate);
      continue;
    }

    const upgradesToHttps = candidate.protocol === 'https:' && current.protocol === 'http:';
    const dropsWww = candidate.protocol === current.protocol && !candidate.hasWww && current.hasWww;
    if (upgradesToHttps || dropsWww) {
      // Overwriting an existing key keeps the group's original position.
      kept.set(key, candidate);
    }
  }

  return Array.from(kept.values(), (entry) => entry.url);
}

View File

@ -24,6 +24,7 @@ export async function fireEngineMap(q: string, options: {
});
if (!process.env.FIRE_ENGINE_BETA_URL) {
console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
return [];
}