From 7a613255009edd70d6662bee33586f2add5270b7 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 16 Aug 2024 17:57:11 -0300
Subject: [PATCH] map + search + scrape markdown bug

---
 CONTRIBUTING.md                               |   1 -
 SELF_HOST.md                                  |   1 -
 apps/api/.env.example                         |   2 -
 apps/api/src/controllers/v0/search.ts         |   1 -
 apps/api/src/controllers/v1/map.ts            | 101 +++++++++---------
 apps/api/src/controllers/v1/types.ts          |   1 +
 apps/api/src/lib/entities.ts                  |   1 +
 apps/api/src/scraper/WebScraper/single_url.ts |   7 +-
 .../src/search/{serper.ts => fireEngine.ts}   |  27 +++--
 apps/api/src/search/googlesearch.ts           |   2 +-
 apps/api/src/search/index.ts                  |  14 +--
 docker-compose.yaml                           |   1 -
 .../kubernetes/cluster-install/secret.yaml    |   1 -
 13 files changed, 74 insertions(+), 86 deletions(-)
 rename apps/api/src/search/{serper.ts => fireEngine.ts} (56%)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index cece879b..d0145a6b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs
diff --git a/SELF_HOST.md b/SELF_HOST.md
index 75066e48..f631cf18 100644
--- a/SELF_HOST.md
+++ b/SELF_HOST.md
@@ -65,7 +65,6 @@ BULL_AUTH_KEY= @
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs
diff --git a/apps/api/.env.example b/apps/api/.env.example
index d607fe7f..f3c1dc1b 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -32,8 +32,6 @@ BULL_AUTH_KEY=@
 LOGTAIL_KEY=
 # set if you have a llamaparse key you'd like to use to parse pdfs
 LLAMAPARSE_API_KEY=
-# set if you have a serper key you'd like to use as a search api
-SERPER_API_KEY=
 # set if you'd like to send slack server health status messages
 SLACK_WEBHOOK_URL=
 # set if you'd like to send posthog events like job logs
diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts
index 73d8b678..948e883d 100644
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@@ -142,7 +142,6 @@ export async function searchController(req: Request, res: Response) {
 
 
     const searchOptions = req.body.searchOptions ?? { limit: 5 };
 
-    const jobId = uuidv4();
 
     try {
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 77167b2a..5dbe4b66 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -1,66 +1,63 @@
-import { Request, Response } from "express";
-import { Logger } from "../../../src/lib/logger";
-import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
-import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
-import { checkTeamCredits } from "../../services/billing/credit_billing";
+import { Response } from "express";
+import { v4 as uuidv4 } from "uuid";
+import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types";
+import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
+import { MapResponse , MapRequest } from "./types";
+import { Logger } from "../../lib/logger";
+import { configDotenv } from "dotenv";
+import { search } from "../../search";
+import { checkAndUpdateURL } from "../../lib/validateUrl";
+
+configDotenv();
 
 export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response) {
 
   req.body = mapRequestSchema.parse(req.body);
-  console.log(req.body);
-  // expected req.body
-  // req.body = {
-  //   url: string
-  //   crawlerOptions:
-  // }
 
+  const id = uuidv4();
+  let links: string[] = [req.body.url];
+  const crawlerOptions = legacyCrawlerOptions(req.body);
 
-  return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
+  const sc: StoredCrawl = {
+    originUrl: req.body.url,
+    crawlerOptions,
+    pageOptions: {},
+    team_id: req.auth.team_id,
+    createdAt: Date.now(),
+  };
 
-  // const mode = req.body.mode ?? "crawl";
+  const crawler = crawlToCrawler(id, sc);
 
-  // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
-  // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+  } catch (e) {
+    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
+  }
 
-  // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
-  //   try {
-  //     const a = new WebScraperDataProvider();
-  //     await a.setOptions({
-  //       jobId: uuidv4(),
-  //       mode: "single_urls",
-  //       urls: [url],
-  //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
-  //       pageOptions: pageOptions,
-  //     });
+  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
 
-  //     const docs = await a.getDocuments(false, (progress) => {
-  //       job.progress({
-  //         current: progress.current,
-  //         total: progress.total,
-  //         current_step: "SCRAPING",
-  //         current_url: progress.currentDocumentUrl,
-  //       });
-  //     });
-  //     return res.json({
-  //       success: true,
-  //       documents: docs,
-  //     });
-  //   } catch (error) {
-  //     Logger.error(error);
-  //     return res.status(500).json({ error: error.message });
-  //   }
-  // }
+  if (sitemap !== null) {
+    sitemap.map(x => { links.push(x.url); });
+  }
 
-  // const job = await addWebScraperJob({
-  //   url: url,
-  //   mode: mode ?? "crawl", // fix for single urls not working
-  //   crawlerOptions: crawlerOptions,
-  //   team_id: team_id,
-  //   pageOptions: pageOptions,
-  //   origin: req.body.origin ?? defaultOrigin,
-  // });
+  const searchResults = await search({
+    query: `site:${req.body.url}`,
+    advanced: false,
+    num_results: 50,
+    lang: "en",
+    country: "us",
+    location: "United States",
+  })
 
-  // await logCrawl(job.id.toString(), team_id);
+  if (searchResults.length > 0) {
+    searchResults.map(x => { links.push(x.url); });
+  }
 
-  // res.json({ jobId: job.id });
+  links = links.map(x => checkAndUpdateURL(x).url);
+  links = [...new Set(links)];
+
+  return res.status(200).json({
+    success: true,
+    links
+  });
 }
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 06a3136e..f43b433a 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -212,6 +212,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
 
 export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
   return {
+    includeMarkdown: x.formats.includes("markdown"),
     includeHtml: x.formats.includes("html"),
     includeRawHtml: x.formats.includes("rawHtml"),
     onlyIncludeTags: x.includeTags,
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 361017e8..5e26360a 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -11,6 +11,7 @@ export interface Progress {
 }
 
 export type PageOptions = {
+  includeMarkdown?: boolean;
   onlyMainContent?: boolean;
   includeHtml?: boolean;
   includeRawHtml?: boolean;
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 434464ae..c2bbbc7b 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -123,6 +123,7 @@ export async function scrapSingleUrl(
   jobId: string,
   urlToScrap: string,
   pageOptions: PageOptions = {
+    includeMarkdown: true,
     onlyMainContent: true,
     includeHtml: false,
     includeRawHtml: false,
@@ -370,7 +371,7 @@ export async function scrapSingleUrl(
     if (screenshot && screenshot.length > 0) {
       document = {
         content: text,
-        markdown: text,
+        markdown: pageOptions.includeMarkdown ? text : undefined,
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
@@ -389,7 +390,7 @@ export async function scrapSingleUrl(
     } else {
       document = {
         content: text,
-        markdown: text,
+        markdown: pageOptions.includeMarkdown ? text : undefined,
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||
@@ -416,7 +417,7 @@ export async function scrapSingleUrl(
     });
     return {
       content: "",
-      markdown: "",
+      markdown: pageOptions.includeMarkdown ? "" : undefined,
       html: "",
       linksOnPage: pageOptions.includeLinks ? [] : undefined,
       metadata: {
diff --git a/apps/api/src/search/serper.ts b/apps/api/src/search/fireEngine.ts
similarity index 56%
rename from apps/api/src/search/serper.ts
rename to apps/api/src/search/fireEngine.ts
index be716367..f0a0303d 100644
--- a/apps/api/src/search/serper.ts
+++ b/apps/api/src/search/fireEngine.ts
@@ -4,42 +4,41 @@ import { SearchResult } from "../../src/lib/entities";
 
 dotenv.config();
 
-export async function serper_search(q, options: {
+export async function fireEngineSearch(q: string, options: {
   tbs?: string;
   filter?: string;
   lang?: string;
   country?: string;
   location?: string;
-  num_results: number;
+  numResults: number;
   page?: number;
 }): Promise<SearchResult[]> {
   let data = JSON.stringify({
     q: q,
-    hl: options.lang,
-    gl: options.country,
+    lang: options.lang,
+    country: options.country,
     location: options.location,
     tbs: options.tbs,
-    num: options.num_results,
+    num: options.numResults,
     page: options.page ?? 1,
   });
 
+  if (!process.env.FIRE_ENGINE_BETA_URL) {
+    return [];
+  }
+
   let config = {
     method: "POST",
-    url: "https://google.serper.dev/search",
+    url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
     headers: {
-      "X-API-KEY": process.env.SERPER_API_KEY,
       "Content-Type": "application/json",
     },
     data: data,
   };
   const response = await axios(config);
-  if (response && response.data && Array.isArray(response.data.organic)) {
-    return response.data.organic.map((a) => ({
-      url: a.link,
-      title: a.title,
-      description: a.snippet,
-    }));
-  }else{
+  if (response && response.data) {
+    return response.data
+  } else {
     return [];
   }
 }
diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts
index 060f4bd8..0e247702 100644
--- a/apps/api/src/search/googlesearch.ts
+++ b/apps/api/src/search/googlesearch.ts
@@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
 
 
 
-export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
+export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
     let proxies = null;
     if (proxy) {
         if (proxy.startsWith("https")) {
diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts
index f5bc06e3..b60fd78f 100644
--- a/apps/api/src/search/index.ts
+++ b/apps/api/src/search/index.ts
@@ -1,10 +1,7 @@
 import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
-import { google_search } from "./googlesearch";
-import { serper_search } from "./serper";
-
-
-
+import { googleSearch } from "./googlesearch";
+import { fireEngineSearch } from "./fireEngine";
 
 export async function search({
   query,
@@ -32,10 +29,10 @@ export async function search({
   timeout?: number;
 }) : Promise<SearchResult[]> {
   try {
-    if (process.env.SERPER_API_KEY ) {
-      return await serper_search(query, {num_results, tbs, filter, lang, country, location});
+    if (process.env.FIRE_ENGINE_BETA_URL) {
+      return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location});
     }
-    return await google_search(
+    return await googleSearch(
       query,
       advanced,
       num_results,
@@ -51,5 +48,4 @@ export async function search({
     Logger.error(`Error in search function: ${error}`);
     return []
   }
-  // if process.env.SERPER_API_KEY is set, use serper
 }
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8c160f4a..24b51762 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -15,7 +15,6 @@ x-common-service: &common-service
       - OPENAI_BASE_URL=${OPENAI_BASE_URL}
       - MODEL_NAME=${MODEL_NAME:-gpt-4o}
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
-      - SERPER_API_KEY=${SERPER_API_KEY}
       - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
       - LOGTAIL_KEY=${LOGTAIL_KEY}
       - BULL_AUTH_KEY=${BULL_AUTH_KEY}
diff --git a/examples/kubernetes/cluster-install/secret.yaml b/examples/kubernetes/cluster-install/secret.yaml
index 2be96320..6d8eed3b 100644
--- a/examples/kubernetes/cluster-install/secret.yaml
+++ b/examples/kubernetes/cluster-install/secret.yaml
@@ -6,7 +6,6 @@ type: Opaque
 data:
   OPENAI_API_KEY: ""
   SLACK_WEBHOOK_URL: ""
-  SERPER_API_KEY: ""
   LLAMAPARSE_API_KEY: ""
   LOGTAIL_KEY: ""
   BULL_AUTH_KEY: ""
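
Illustrative usage sketch (not part of the patch): with this change, search() in apps/api/src/search/index.ts routes to fireEngineSearch whenever FIRE_ENGINE_BETA_URL is set and otherwise falls back to googleSearch, and the new v1 map controller relies on it for a site-scoped lookup. A minimal TypeScript caller, assuming the "../../search" import path used by map.ts and the SearchResult shape ({ url, title, description }) shown in the diff:

  // Sketch only: mirrors how the reworked map controller queries search().
  import { search } from "../../search";

  async function siteLinks(siteUrl: string): Promise<string[]> {
    // Same arguments the map controller passes: a site: query capped at 50 results.
    const results = await search({
      query: `site:${siteUrl}`,
      advanced: false,
      num_results: 50,
      lang: "en",
      country: "us",
      location: "United States",
    });
    // With FIRE_ENGINE_BETA_URL unset this comes from googleSearch; otherwise from fire-engine.
    return results.map((r) => r.url);
  }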
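
A second sketch, also illustrative rather than part of the patch: the new includeMarkdown flag on PageOptions is what lets v1 scrapes omit the markdown field when it was not requested. Assuming only the legacyScrapeOptions mapping and the formats array shown above, the gating works roughly like this:

  // Sketch only: how requested formats translate into PageOptions after this patch.
  type Format = "markdown" | "html" | "rawHtml";

  function pageOptionsSketch(formats: Format[]) {
    // Mirrors legacyScrapeOptions: markdown is now opt-in instead of always returned.
    return {
      includeMarkdown: formats.includes("markdown"),
      includeHtml: formats.includes("html"),
      includeRawHtml: formats.includes("rawHtml"),
    };
  }

  // pageOptionsSketch(["html"]) -> { includeMarkdown: false, includeHtml: true, includeRawHtml: false },
  // so scrapSingleUrl sets document.markdown to undefined instead of the markdown text.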