mirror of https://github.com/mendableai/firecrawl.git

commit 7a61325500 (parent 3fcb21930e)

    map + search + scrape markdown bug
@@ -44,7 +44,6 @@ BULL_AUTH_KEY=
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs

@@ -65,7 +65,6 @@ BULL_AUTH_KEY=
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
-SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
 POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
 POSTHOG_HOST= # set if you'd like to send posthog events like job logs

@@ -32,8 +32,6 @@ BULL_AUTH_KEY=
 LOGTAIL_KEY=
 # set if you have a llamaparse key you'd like to use to parse pdfs
 LLAMAPARSE_API_KEY=
-# set if you have a serper key you'd like to use as a search api
-SERPER_API_KEY=
 # set if you'd like to send slack server health status messages
 SLACK_WEBHOOK_URL=
 # set if you'd like to send posthog events like job logs
@@ -142,7 +142,6 @@ export async function searchController(req: Request, res: Response) {
 
   const searchOptions = req.body.searchOptions ?? { limit: 5 };
 
-
   const jobId = uuidv4();
 
   try {
@@ -1,66 +1,63 @@
-import { Request, Response } from "express";
-import { Logger } from "../../../src/lib/logger";
-import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
-import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
-import { checkTeamCredits } from "../../services/billing/credit_billing";
+import { Response } from "express";
+import { v4 as uuidv4 } from "uuid";
+import { legacyCrawlerOptions, mapRequestSchema, RequestWithAuth } from "./types";
+import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
+import { MapResponse, MapRequest } from "./types";
+import { Logger } from "../../lib/logger";
+import { configDotenv } from "dotenv";
+import { search } from "../../search";
+import { checkAndUpdateURL } from "../../lib/validateUrl";
+
+configDotenv();
 
 export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
   req.body = mapRequestSchema.parse(req.body);
-  console.log(req.body);
-  // expected req.body
 
-  // req.body = {
-  //   url: string
-  //   crawlerOptions:
-  // }
-
-  return res.status(200).json({ success: true, links: [ "test1", "test2" ] });
-
-  // const mode = req.body.mode ?? "crawl";
-
-  // const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
-  // const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
-
-  // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
-  //   try {
-  //     const a = new WebScraperDataProvider();
-  //     await a.setOptions({
-  //       jobId: uuidv4(),
-  //       mode: "single_urls",
-  //       urls: [url],
-  //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
-  //       pageOptions: pageOptions,
-  //     });
-
-  //     const docs = await a.getDocuments(false, (progress) => {
-  //       job.progress({
-  //         current: progress.current,
-  //         total: progress.total,
-  //         current_step: "SCRAPING",
-  //         current_url: progress.currentDocumentUrl,
-  //       });
-  //     });
-  //     return res.json({
-  //       success: true,
-  //       documents: docs,
-  //     });
-  //   } catch (error) {
-  //     Logger.error(error);
-  //     return res.status(500).json({ error: error.message });
-  //   }
-  // }
-
-  // const job = await addWebScraperJob({
-  //   url: url,
-  //   mode: mode ?? "crawl", // fix for single urls not working
-  //   crawlerOptions: crawlerOptions,
-  //   team_id: team_id,
-  //   pageOptions: pageOptions,
-  //   origin: req.body.origin ?? defaultOrigin,
-  // });
-
-  // await logCrawl(job.id.toString(), team_id);
-
-  // res.json({ jobId: job.id });
+
+  const id = uuidv4();
+  let links: string[] = [req.body.url];
+
+  const crawlerOptions = legacyCrawlerOptions(req.body);
+
+  const sc: StoredCrawl = {
+    originUrl: req.body.url,
+    crawlerOptions,
+    pageOptions: {},
+    team_id: req.auth.team_id,
+    createdAt: Date.now(),
+  };
+
+  const crawler = crawlToCrawler(id, sc);
+
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+  } catch (e) {
+    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
+  }
+
+  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
+
+  if (sitemap !== null) {
+    sitemap.map(x => { links.push(x.url); });
+  }
+
+  const searchResults = await search({
+    query: `site:${req.body.url}`,
+    advanced: false,
+    num_results: 50,
+    lang: "en",
+    country: "us",
+    location: "United States",
+  })
+
+  if (searchResults.length > 0) {
+    searchResults.map(x => { links.push(x.url); });
+  }
+
+  links = links.map(x => checkAndUpdateURL(x).url);
+  links = [...new Set(links)];
+
+  return res.status(200).json({
+    success: true,
+    links
+  });
 }
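From a client's point of view, the rewritten map controller now accepts a url, expands it via the site's sitemap plus a `site:` search, dedupes, and returns the link list. A minimal sketch of calling it, assuming the controller is mounted at a /v1/map route behind a bearer token (both assumptions; neither the route path nor the auth scheme is shown in this diff):

// Hypothetical client call illustrating the new request/response shape of mapController.
// Only the body field (url) and the response fields (success, links) come from the diff above.
async function mapSite(apiUrl: string, token: string, url: string): Promise<string[]> {
  const res = await fetch(`${apiUrl}/v1/map`, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${token}` },
    body: JSON.stringify({ url }),
  });
  const body = (await res.json()) as { success: boolean; links: string[] };
  return body.success ? body.links : [];
}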
@@ -212,6 +212,7 @@ export function legacyCrawlerOptions(x: CrawlerOptions) {
 
 export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
   return {
+    includeMarkdown: x.formats.includes("markdown"),
     includeHtml: x.formats.includes("html"),
     includeRawHtml: x.formats.includes("rawHtml"),
     onlyIncludeTags: x.includeTags,
@@ -11,6 +11,7 @@ export interface Progress {
 }
 
 export type PageOptions = {
+  includeMarkdown?: boolean;
   onlyMainContent?: boolean;
   includeHtml?: boolean;
   includeRawHtml?: boolean;
@@ -123,6 +123,7 @@ export async function scrapSingleUrl(
   jobId: string,
   urlToScrap: string,
   pageOptions: PageOptions = {
+    includeMarkdown: true,
     onlyMainContent: true,
     includeHtml: false,
     includeRawHtml: false,

@@ -370,7 +371,7 @@ export async function scrapSingleUrl(
     if (screenshot && screenshot.length > 0) {
       document = {
         content: text,
-        markdown: text,
+        markdown: pageOptions.includeMarkdown ? text : undefined,
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||

@@ -389,7 +390,7 @@ export async function scrapSingleUrl(
     } else {
       document = {
         content: text,
-        markdown: text,
+        markdown: pageOptions.includeMarkdown ? text : undefined,
         html: pageOptions.includeHtml ? html : undefined,
         rawHtml:
           pageOptions.includeRawHtml ||

@@ -416,7 +417,7 @@ export async function scrapSingleUrl(
     });
     return {
       content: "",
-      markdown: "",
+      markdown: pageOptions.includeMarkdown ? "" : undefined,
       html: "",
       linksOnPage: pageOptions.includeLinks ? [] : undefined,
       metadata: {
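The markdown bug fix above follows one pattern throughout scrapSingleUrl: the markdown field is populated only when pageOptions.includeMarkdown is true, mirroring the existing includeHtml and includeRawHtml flags. A condensed sketch of that gating (type and field names come from the diff; the buildDocument helper itself is hypothetical, and the real rawHtml condition has extra clauses):

// Hypothetical helper showing the per-format gating used in scrapSingleUrl:
// each output format is returned only when its PageOptions flag is enabled.
type PageOptionsSketch = { includeMarkdown?: boolean; includeHtml?: boolean; includeRawHtml?: boolean };

function buildDocument(text: string, html: string, rawHtml: string, pageOptions: PageOptionsSketch) {
  return {
    content: text,
    markdown: pageOptions.includeMarkdown ? text : undefined,
    html: pageOptions.includeHtml ? html : undefined,
    rawHtml: pageOptions.includeRawHtml ? rawHtml : undefined,
  };
}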
@@ -4,42 +4,41 @@ import { SearchResult } from "../../src/lib/entities";
 
 dotenv.config();
 
-export async function serper_search(q, options: {
+export async function fireEngineSearch(q: string, options: {
   tbs?: string;
   filter?: string;
   lang?: string;
   country?: string;
   location?: string;
-  num_results: number;
+  numResults: number;
   page?: number;
 }): Promise<SearchResult[]> {
   let data = JSON.stringify({
     q: q,
-    hl: options.lang,
-    gl: options.country,
+    lang: options.lang,
+    country: options.country,
     location: options.location,
     tbs: options.tbs,
-    num: options.num_results,
+    num: options.numResults,
     page: options.page ?? 1,
   });
 
+  if (!process.env.FIRE_ENGINE_BETA_URL) {
+    return [];
+  }
+
   let config = {
     method: "POST",
-    url: "https://google.serper.dev/search",
+    url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
     headers: {
-      "X-API-KEY": process.env.SERPER_API_KEY,
       "Content-Type": "application/json",
     },
     data: data,
   };
   const response = await axios(config);
-  if (response && response.data && Array.isArray(response.data.organic)) {
-    return response.data.organic.map((a) => ({
-      url: a.link,
-      title: a.title,
-      description: a.snippet,
-    }));
-  }else{
+  if (response && response.data) {
+    return response.data
+  } else {
     return [];
   }
 }
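Based on the signature above, a hedged example of calling the renamed fireEngineSearch helper. Note that it now short-circuits to an empty array when FIRE_ENGINE_BETA_URL is not configured, so callers cannot distinguish "provider unavailable" from "no hits":

// Sketch only: the fireEngineSearch name and its option fields come from the diff above;
// the query string and option values are illustrative.
import { fireEngineSearch } from "./fireEngine";

async function exampleFireEngineCall() {
  const results = await fireEngineSearch("site:example.com", {
    numResults: 50,
    lang: "en",
    country: "us",
    location: "United States",
  });
  // Each SearchResult is expected to expose at least a url (that is what mapController reads).
  console.log(results.map((r) => r.url));
}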
@@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string
 
 
 
-export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
+export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
   let proxies = null;
   if (proxy) {
     if (proxy.startsWith("https")) {
@@ -1,10 +1,7 @@
 import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
-import { google_search } from "./googlesearch";
-import { serper_search } from "./serper";
+import { googleSearch } from "./googlesearch";
+import { fireEngineSearch } from "./fireEngine";
 
-
-
-
 export async function search({
   query,

@@ -32,10 +29,10 @@ export async function search({
   timeout?: number;
 }) : Promise<SearchResult[]> {
   try {
-    if (process.env.SERPER_API_KEY ) {
-      return await serper_search(query, {num_results, tbs, filter, lang, country, location});
+    if (process.env.FIRE_ENGINE_BETA_URL) {
+      return await fireEngineSearch(query, {numResults: num_results, tbs, filter, lang, country, location});
     }
-    return await google_search(
+    return await googleSearch(
       query,
       advanced,
       num_results,

@@ -51,5 +48,4 @@ export async function search({
     Logger.error(`Error in search function: ${error}`);
     return []
   }
-  // if process.env.SERPER_API_KEY is set, use serper
 }
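With SERPER_API_KEY gone, provider selection in search() is driven purely by environment configuration: fire-engine when FIRE_ENGINE_BETA_URL is set, otherwise direct Google scraping. A compressed sketch of just that branch (the real function forwards tbs, filter, location and more; pickSearchProvider is a hypothetical name for illustration):

// Condensed sketch of the provider fallback in the search module after this commit.
import { SearchResult } from "../../src/lib/entities";
import { fireEngineSearch } from "./fireEngine";
import { googleSearch } from "./googlesearch";

export async function pickSearchProvider(query: string, num_results: number): Promise<SearchResult[]> {
  if (process.env.FIRE_ENGINE_BETA_URL) {
    // hosted fire-engine search; option renamed to numResults in this commit
    return await fireEngineSearch(query, { numResults: num_results });
  }
  // default: scrape Google results directly
  return await googleSearch(query, false, num_results);
}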
@@ -15,7 +15,6 @@ x-common-service: &common-service
       - OPENAI_BASE_URL=${OPENAI_BASE_URL}
       - MODEL_NAME=${MODEL_NAME:-gpt-4o}
       - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
-      - SERPER_API_KEY=${SERPER_API_KEY}
       - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
       - LOGTAIL_KEY=${LOGTAIL_KEY}
       - BULL_AUTH_KEY=${BULL_AUTH_KEY}
@@ -6,7 +6,6 @@ type: Opaque
 data:
   OPENAI_API_KEY: ""
   SLACK_WEBHOOK_URL: ""
-  SERPER_API_KEY: ""
   LLAMAPARSE_API_KEY: ""
   LOGTAIL_KEY: ""
   BULL_AUTH_KEY: ""