mirror of
https://github.com/mendableai/firecrawl.git
synced 2024-11-16 03:32:22 +08:00
map + search + scrape markdown bug
This commit is contained in:
parent
3fcb21930e
commit
7a61325500
|
@ -44,7 +44,6 @@ BULL_AUTH_KEY= @
|
|||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
|
|
|
@ -65,7 +65,6 @@ BULL_AUTH_KEY= @
|
|||
LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
|
||||
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
|
||||
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
|
||||
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
|
||||
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
|
||||
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
|
||||
|
|
|
@ -32,8 +32,6 @@ BULL_AUTH_KEY=@
|
|||
LOGTAIL_KEY=
|
||||
# set if you have a llamaparse key you'd like to use to parse pdfs
|
||||
LLAMAPARSE_API_KEY=
|
||||
# set if you have a serper key you'd like to use as a search api
|
||||
SERPER_API_KEY=
|
||||
# set if you'd like to send slack server health status messages
|
||||
SLACK_WEBHOOK_URL=
|
||||
# set if you'd like to send posthog events like job logs
|
||||
|
|
|
@ -142,7 +142,6 @@ export async function searchController(req: Request, res: Response) {
|
|||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 5 };
|
||||
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
try {
|
||||
|
|
|
@ -1,66 +1,63 @@
|
|||
import { Request, Response } from "express";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
|
||||
import { checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types";
|
||||
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
|
||||
import { MapResponse , MapRequest } from "./types";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { search } from "../../search";
|
||||
import { checkAndUpdateURL } from "../../lib/validateUrl";
|
||||
|
||||
configDotenv();
|
||||
|
||||
export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
|
||||
req.body = mapRequestSchema.parse(req.body);
|
||||
console.log(req.body);
|
||||
// expected req.body
|
||||
|
||||
// req.body = {
|
||||
// url: string
|
||||
// crawlerOptions:
|
||||
// }
|
||||
const id = uuidv4();
|
||||
let links: string[] = [req.body.url];
|
||||
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
||||
|
||||
return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions,
|
||||
pageOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
};
|
||||
|
||||
// const mode = req.body.mode ?? "crawl";
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
// const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
|
||||
// const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
} catch (e) {
|
||||
Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
|
||||
}
|
||||
|
||||
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
|
||||
// try {
|
||||
// const a = new WebScraperDataProvider();
|
||||
// await a.setOptions({
|
||||
// jobId: uuidv4(),
|
||||
// mode: "single_urls",
|
||||
// urls: [url],
|
||||
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
|
||||
// pageOptions: pageOptions,
|
||||
// });
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
|
||||
|
||||
// const docs = await a.getDocuments(false, (progress) => {
|
||||
// job.progress({
|
||||
// current: progress.current,
|
||||
// total: progress.total,
|
||||
// current_step: "SCRAPING",
|
||||
// current_url: progress.currentDocumentUrl,
|
||||
// });
|
||||
// });
|
||||
// return res.json({
|
||||
// success: true,
|
||||
// documents: docs,
|
||||
// });
|
||||
// } catch (error) {
|
||||
// Logger.error(error);
|
||||
// return res.status(500).json({ error: error.message });
|
||||
// }
|
||||
// }
|
||||
if (sitemap !== null) {
|
||||
sitemap.map(x => { links.push(x.url); });
|
||||
}
|
||||
|
||||
// const job = await addWebScraperJob({
|
||||
// url: url,
|
||||
// mode: mode ?? "crawl", // fix for single urls not working
|
||||
// crawlerOptions: crawlerOptions,
|
||||
// team_id: team_id,
|
||||
// pageOptions: pageOptions,
|
||||
// origin: req.body.origin ?? defaultOrigin,
|
||||
// });
|
||||
const searchResults = await search({
|
||||
query: `site:${req.body.url}`,
|
||||
advanced: false,
|
||||
num_results: 50,
|
||||
lang: "en",
|
||||
country: "us",
|
||||
location: "United States",
|
||||
})
|
||||
|
||||
// await logCrawl(job.id.toString(), team_id);
|
||||
if (searchResults.length > 0) {
|
||||
searchResults.map(x => { links.push(x.url); });
|
||||
}
|
||||
|
||||
// res.json({ jobId: job.id });
|
||||
links = links.map(x => checkAndUpdateURL(x).url);
|
||||
links = [...new Set(links)];
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
links
|
||||
});
|
||||
}
|
||||
|
|
|
@ -212,6 +212,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
|
|||
|
||||
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
||||
return {
|
||||
includeMarkdown: x.formats.includes("markdown"),
|
||||
includeHtml: x.formats.includes("html"),
|
||||
includeRawHtml: x.formats.includes("rawHtml"),
|
||||
onlyIncludeTags: x.includeTags,
|
||||
|
|
|
@ -11,6 +11,7 @@ export interface Progress {
|
|||
}
|
||||
|
||||
export type PageOptions = {
|
||||
includeMarkdown?: boolean;
|
||||
onlyMainContent?: boolean;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
|
|
|
@ -123,6 +123,7 @@ export async function scrapSingleUrl(
|
|||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
includeMarkdown: true,
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
includeRawHtml: false,
|
||||
|
@ -370,7 +371,7 @@ export async function scrapSingleUrl(
|
|||
if (screenshot && screenshot.length > 0) {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
|
@ -389,7 +390,7 @@ export async function scrapSingleUrl(
|
|||
} else {
|
||||
document = {
|
||||
content: text,
|
||||
markdown: text,
|
||||
markdown: pageOptions.includeMarkdown ? text : undefined,
|
||||
html: pageOptions.includeHtml ? html : undefined,
|
||||
rawHtml:
|
||||
pageOptions.includeRawHtml ||
|
||||
|
@ -416,7 +417,7 @@ export async function scrapSingleUrl(
|
|||
});
|
||||
return {
|
||||
content: "",
|
||||
markdown: "",
|
||||
markdown: pageOptions.includeMarkdown ? "" : undefined,
|
||||
html: "",
|
||||
linksOnPage: pageOptions.includeLinks ? [] : undefined,
|
||||
metadata: {
|
||||
|
|
|
@ -4,42 +4,41 @@ import { SearchResult } from "../../src/lib/entities";
|
|||
|
||||
dotenv.config();
|
||||
|
||||
export async function serper_search(q, options: {
|
||||
export async function fireEngineSearch(q: string, options: {
|
||||
tbs?: string;
|
||||
filter?: string;
|
||||
lang?: string;
|
||||
country?: string;
|
||||
location?: string;
|
||||
num_results: number;
|
||||
numResults: number;
|
||||
page?: number;
|
||||
}): Promise<SearchResult[]> {
|
||||
let data = JSON.stringify({
|
||||
q: q,
|
||||
hl: options.lang,
|
||||
gl: options.country,
|
||||
lang: options.lang,
|
||||
country: options.country,
|
||||
location: options.location,
|
||||
tbs: options.tbs,
|
||||
num: options.num_results,
|
||||
num: options.numResults,
|
||||
page: options.page ?? 1,
|
||||
});
|
||||
|
||||
if (!process.env.FIRE_ENGINE_BETA_URL) {
|
||||
return [];
|
||||
}
|
||||
|
||||
let config = {
|
||||
method: "POST",
|
||||
url: "https://google.serper.dev/search",
|
||||
url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
|
||||
headers: {
|
||||
"X-API-KEY": process.env.SERPER_API_KEY,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
data: data,
|
||||
};
|
||||
const response = await axios(config);
|
||||
if (response && response.data && Array.isArray(response.data.organic)) {
|
||||
return response.data.organic.map((a) => ({
|
||||
url: a.link,
|
||||
title: a.title,
|
||||
description: a.snippet,
|
||||
}));
|
||||
}else{
|
||||
if (response && response.data) {
|
||||
return response.data
|
||||
} else {
|
||||
return [];
|
||||
}
|
||||
}
|
|
@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
|
|||
|
||||
|
||||
|
||||
export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
|
||||
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
|
||||
let proxies = null;
|
||||
if (proxy) {
|
||||
if (proxy.startsWith("https")) {
|
||||
|
|
|
@ -1,10 +1,7 @@
|
|||
import { Logger } from "../../src/lib/logger";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
import { google_search } from "./googlesearch";
|
||||
import { serper_search } from "./serper";
|
||||
|
||||
|
||||
|
||||
import { googleSearch } from "./googlesearch";
|
||||
import { fireEngineSearch } from "./fireEngine";
|
||||
|
||||
export async function search({
|
||||
query,
|
||||
|
@ -32,10 +29,10 @@ export async function search({
|
|||
timeout?: number;
|
||||
}) : Promise<SearchResult[]> {
|
||||
try {
|
||||
if (process.env.SERPER_API_KEY ) {
|
||||
return await serper_search(query, {num_results, tbs, filter, lang, country, location});
|
||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||
return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location});
|
||||
}
|
||||
return await google_search(
|
||||
return await googleSearch(
|
||||
query,
|
||||
advanced,
|
||||
num_results,
|
||||
|
@ -51,5 +48,4 @@ export async function search({
|
|||
Logger.error(`Error in search function: ${error}`);
|
||||
return []
|
||||
}
|
||||
// if process.env.SERPER_API_KEY is set, use serper
|
||||
}
|
||||
|
|
|
@ -15,7 +15,6 @@ x-common-service: &common-service
|
|||
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
|
||||
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
|
||||
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
|
||||
- SERPER_API_KEY=${SERPER_API_KEY}
|
||||
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
|
||||
- LOGTAIL_KEY=${LOGTAIL_KEY}
|
||||
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
|
||||
|
|
|
@ -6,7 +6,6 @@ type: Opaque
|
|||
data:
|
||||
OPENAI_API_KEY: ""
|
||||
SLACK_WEBHOOK_URL: ""
|
||||
SERPER_API_KEY: ""
|
||||
LLAMAPARSE_API_KEY: ""
|
||||
LOGTAIL_KEY: ""
|
||||
BULL_AUTH_KEY: ""
|
||||
|
|
Loading…
Reference in New Issue
Block a user