diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 45856543..af97c6f1 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -1,10 +1,6 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; -import { - mapRequestSchema, - RequestWithAuth, - scrapeOptions, -} from "./types"; +import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { MapResponse, MapRequest } from "./types"; import { configDotenv } from "dotenv"; @@ -46,6 +42,7 @@ export async function mapController( originUrl: req.body.url, crawlerOptions: { ...req.body, + limit: req.body.sitemapOnly ? 10000000 : limit, scrapeOptions: undefined, }, scrapeOptions: scrapeOptions.parse({}), @@ -57,77 +54,92 @@ export async function mapController( const crawler = crawlToCrawler(id, sc); - let urlWithoutWww = req.body.url.replace("www.", ""); - - let mapUrl = req.body.search - ? `"${req.body.search}" site:${urlWithoutWww}` - : `site:${req.body.url}`; - - const resultsPerPage = 100; - const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage); - - const cacheKey = `fireEngineMap:${mapUrl}`; - const cachedResult = null; - - let allResults: any[] = []; - let pagePromises: Promise[] = []; - - if (cachedResult) { - allResults = JSON.parse(cachedResult); - } else { - const fetchPage = async (page: number) => { - return fireEngineMap(mapUrl, { - numResults: resultsPerPage, - page: page, - }); - }; - - pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1)); - allResults = await Promise.all(pagePromises); - - await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours - } - - // Parallelize sitemap fetch with serper search - const [sitemap, ...searchResults] = await Promise.all([ - req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), - ...(cachedResult ? [] : pagePromises), - ]); - - if (!cachedResult) { - allResults = searchResults; - } - - if (sitemap !== null) { - sitemap.forEach((x) => { - links.push(x.url); - }); - } - - let mapResults = allResults - .flat() - .filter((result) => result !== null && result !== undefined); - - const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); - if (mapResults.length > minumumCutoff) { - mapResults = mapResults.slice(0, minumumCutoff); - } - - if (mapResults.length > 0) { - if (req.body.search) { - // Ensure all map results are first, maintaining their order - links = [ - mapResults[0].url, - ...mapResults.slice(1).map((x) => x.url), - ...links, - ]; - } else { - mapResults.map((x) => { + // If sitemapOnly is true, only get links from sitemap + if (req.body.sitemapOnly) { + const sitemap = await crawler.tryGetSitemap(); + if (sitemap !== null) { + sitemap.forEach((x) => { links.push(x.url); }); } - } + } else { + let urlWithoutWww = req.body.url.replace("www.", ""); + let mapUrl = req.body.search + ? `"${req.body.search}" site:${urlWithoutWww}` + : `site:${req.body.url}`; + + const resultsPerPage = 100; + const maxPages = Math.ceil( + Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage + ); + + const cacheKey = `fireEngineMap:${mapUrl}`; + const cachedResult = null; + + let allResults: any[] = []; + let pagePromises: Promise[] = []; + + if (cachedResult) { + allResults = JSON.parse(cachedResult); + } else { + const fetchPage = async (page: number) => { + return fireEngineMap(mapUrl, { + numResults: resultsPerPage, + page: page, + }); + }; + + pagePromises = Array.from({ length: maxPages }, (_, i) => + fetchPage(i + 1) + ); + allResults = await Promise.all(pagePromises); + + await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours + } + + // Parallelize sitemap fetch with serper search + const [sitemap, ...searchResults] = await Promise.all([ + req.body.ignoreSitemap ? null : crawler.tryGetSitemap(), + ...(cachedResult ? [] : pagePromises), + ]); + + if (!cachedResult) { + allResults = searchResults; + } + + if (sitemap !== null) { + sitemap.forEach((x) => { + links.push(x.url); + }); + } + + let mapResults = allResults + .flat() + .filter((result) => result !== null && result !== undefined); + + const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit); + if (mapResults.length > minumumCutoff) { + mapResults = mapResults.slice(0, minumumCutoff); + } + + if (mapResults.length > 0) { + if (req.body.search) { + // Ensure all map results are first, maintaining their order + links = [ + mapResults[0].url, + ...mapResults.slice(1).map((x) => x.url), + ...links, + ]; + } else { + mapResults.map((x) => { + links.push(x.url); + }); + } + } + + + } // Perform cosine similarity between the search query and the list of links if (req.body.search) { const searchQuery = req.body.search.toLowerCase(); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index d885e128..e14087e1 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -261,6 +261,7 @@ export const mapRequestSchema = crawlerOptions.extend({ includeSubdomains: z.boolean().default(true), search: z.string().optional(), ignoreSitemap: z.boolean().default(false), + sitemapOnly: z.boolean().default(false), limit: z.number().min(1).max(5000).default(5000), }).strict(strictMessage); diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index f5e43544..30b72d22 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.8.3", + "version": "1.8.4", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 18038945..3ea9d9e1 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -221,6 +221,7 @@ export interface MapParams { search?: string; ignoreSitemap?: boolean; includeSubdomains?: boolean; + sitemapOnly?: boolean; limit?: number; }