map + search + scrape markdown bug

This commit is contained in:
rafaelsideguide 2024-08-16 17:57:11 -03:00
parent 3fcb21930e
commit 7a61325500
13 changed files with 74 additions and 86 deletions

View File

@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

View File

@ -65,7 +65,6 @@ BULL_AUTH_KEY= @
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

View File

@ -32,8 +32,6 @@ BULL_AUTH_KEY=@
LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you have a serper key you'd like to use as a search api
SERPER_API_KEY=
# set if you'd like to send slack server health status messages
SLACK_WEBHOOK_URL=
# set if you'd like to send posthog events like job logs

View File

@ -142,7 +142,6 @@ export async function searchController(req: Request, res: Response) {
const searchOptions = req.body.searchOptions ?? { limit: 5 };
const jobId = uuidv4();
try {

View File

@ -1,66 +1,63 @@
import { Request, Response } from "express";
import { Logger } from "../../../src/lib/logger";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
import { checkTeamCredits } from "../../services/billing/credit_billing";
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse , MapRequest } from "./types";
import { Logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
import { search } from "../../search";
import { checkAndUpdateURL } from "../../lib/validateUrl";
configDotenv();
/**
 * Handles a "map" request: collects candidate links for the requested URL and
 * returns them as a de-duplicated list.
 *
 * Links are gathered from three places, in this order:
 *   1. the requested URL itself,
 *   2. the site's sitemap (skipped when `crawlerOptions.ignoreSitemap` is set),
 *   3. a `site:<url>` query sent through the shared `search` helper.
 *
 * @param req - Authenticated request; `req.body` is replaced with the result
 *              of `mapRequestSchema.parse`.
 * @param res - Express response used to send the `MapResponse` payload.
 * @returns The response after sending `{ success: true, links }` with HTTP 200.
 */
export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
  // Replace the raw body with the schema-parsed body before reading from it.
  // NOTE(review): presumably `parse` throws on invalid input — confirm the
  // caller/middleware turns that into an error response.
  req.body = mapRequestSchema.parse(req.body);

  const id = uuidv4();
  // The requested URL is always the first entry of the result.
  let links: string[] = [req.body.url];

  const crawlerOptions = legacyCrawlerOptions(req.body);

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions,
    pageOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
  };

  const crawler = crawlToCrawler(id, sc);

  // Best effort: a failing robots.txt fetch must not fail the whole request.
  try {
    sc.robots = await crawler.getRobotsTxt();
  } catch (e) {
    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
  }

  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
  if (sitemap !== null) {
    sitemap.forEach((entry) => {
      links.push(entry.url);
    });
  }

  const searchResults = await search({
    query: `site:${req.body.url}`,
    advanced: false,
    num_results: 50,
    lang: "en",
    country: "us",
    location: "United States",
  });
  searchResults.forEach((result) => {
    links.push(result.url);
  });

  // Run every link through checkAndUpdateURL, then drop duplicates; the Set
  // keeps first-seen order, so the requested URL stays first.
  links = links.map((link) => checkAndUpdateURL(link).url);
  links = [...new Set(links)];

  return res.status(200).json({
    success: true,
    links,
  });
}

View File

@ -212,6 +212,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
return {
includeMarkdown: x.formats.includes("markdown"),
includeHtml: x.formats.includes("html"),
includeRawHtml: x.formats.includes("rawHtml"),
onlyIncludeTags: x.includeTags,

View File

@ -11,6 +11,7 @@ export interface Progress {
}
export type PageOptions = {
includeMarkdown?: boolean;
onlyMainContent?: boolean;
includeHtml?: boolean;
includeRawHtml?: boolean;

View File

@ -123,6 +123,7 @@ export async function scrapSingleUrl(
jobId: string,
urlToScrap: string,
pageOptions: PageOptions = {
includeMarkdown: true,
onlyMainContent: true,
includeHtml: false,
includeRawHtml: false,
@ -370,7 +371,7 @@ export async function scrapSingleUrl(
if (screenshot && screenshot.length > 0) {
document = {
content: text,
markdown: text,
markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
@ -389,7 +390,7 @@ export async function scrapSingleUrl(
} else {
document = {
content: text,
markdown: text,
markdown: pageOptions.includeMarkdown ? text : undefined,
html: pageOptions.includeHtml ? html : undefined,
rawHtml:
pageOptions.includeRawHtml ||
@ -416,7 +417,7 @@ export async function scrapSingleUrl(
});
return {
content: "",
markdown: "",
markdown: pageOptions.includeMarkdown ? "" : undefined,
html: "",
linksOnPage: pageOptions.includeLinks ? [] : undefined,
metadata: {

View File

@ -4,41 +4,40 @@ import { SearchResult } from "../../src/lib/entities";
dotenv.config();
export async function serper_search(q, options: {
export async function fireEngineSearch(q: string, options: {
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
num_results: number;
numResults: number;
page?: number;
}): Promise<SearchResult[]> {
let data = JSON.stringify({
q: q,
hl: options.lang,
gl: options.country,
lang: options.lang,
country: options.country,
location: options.location,
tbs: options.tbs,
num: options.num_results,
num: options.numResults,
page: options.page ?? 1,
});
if (!process.env.FIRE_ENGINE_BETA_URL) {
return [];
}
let config = {
method: "POST",
url: "https://google.serper.dev/search",
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
headers: {
"X-API-KEY": process.env.SERPER_API_KEY,
"Content-Type": "application/json",
},
data: data,
};
const response = await axios(config);
if (response && response.data && Array.isArray(response.data.organic)) {
return response.data.organic.map((a) => ({
url: a.link,
title: a.title,
description: a.snippet,
}));
if (response && response.data) {
return response.data
} else {
return [];
}

View File

@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
let proxies = null;
if (proxy) {
if (proxy.startsWith("https")) {

View File

@ -1,10 +1,7 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { serper_search } from "./serper";
import { googleSearch } from "./googlesearch";
import { fireEngineSearch } from "./fireEngine";
export async function search({
query,
@ -32,10 +29,10 @@ export async function search({
timeout?: number;
}) : Promise<SearchResult[]> {
try {
if (process.env.SERPER_API_KEY ) {
return await serper_search(query, {num_results, tbs, filter, lang, country, location});
if (process.env.FIRE_ENGINE_BETA_URL) {
return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location});
}
return await google_search(
return await googleSearch(
query,
advanced,
num_results,
@ -51,5 +48,4 @@ export async function search({
Logger.error(`Error in search function: ${error}`);
return []
}
// if process.env.SERPER_API_KEY is set, use serper
}

View File

@ -15,7 +15,6 @@ x-common-service: &common-service
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
- SERPER_API_KEY=${SERPER_API_KEY}
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
- LOGTAIL_KEY=${LOGTAIL_KEY}
- BULL_AUTH_KEY=${BULL_AUTH_KEY}

View File

@ -6,7 +6,6 @@ type: Opaque
data:
OPENAI_API_KEY: ""
SLACK_WEBHOOK_URL: ""
SERPER_API_KEY: ""
LLAMAPARSE_API_KEY: ""
LOGTAIL_KEY: ""
BULL_AUTH_KEY: ""