Mirror of https://github.com/mendableai/firecrawl.git, synced 2024-11-16 03:32:22 +08:00
Update crawl.ts

parent 36b35dbc67
commit 8e4ca86463
@@ -7,10 +7,22 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
-import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+import {
+  defaultCrawlPageOptions,
+  defaultCrawlerOptions,
+  defaultOrigin,
+} from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../src/lib/validateUrl";
@@ -38,10 +50,12 @@ export async function crawlController(req: Request, res: Response) {
     }
   }

-  const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
+  const crawlerOptions = {
+    ...defaultCrawlerOptions,
+    ...req.body.crawlerOptions,
+  };
   const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };

-
   const limitCheck = crawlerOptions?.limit ?? 1;
   const { success: creditsCheckSuccess, message: creditsCheckMessage } =
     await checkTeamCredits(team_id, limitCheck);
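Aside: the crawlerOptions and pageOptions merges above rely on object-spread precedence. Defaults are spread first, so any field supplied in the request body overrides the default. A minimal illustration (option names and values here are hypothetical, not taken from default-values):

    const exampleDefaults = { limit: 10000, ignoreSitemap: false }; // hypothetical defaults
    const fromRequestBody = { limit: 50 };                          // caller-supplied options
    const crawlerOptions = { ...exampleDefaults, ...fromRequestBody };
    // -> { limit: 50, ignoreSitemap: false }: request values win, defaults fill the rest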
@@ -63,17 +77,14 @@ export async function crawlController(req: Request, res: Response) {
   }

   if (isUrlBlocked(url)) {
-    return res
-      .status(403)
-      .json({
-        error:
-          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
-      });
+    return res.status(403).json({
+      error:
+        "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
+    });
   }

   const mode = req.body.mode ?? "crawl";
-

   // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
   //   try {
   //     const a = new WebScraperDataProvider();
@@ -123,10 +134,12 @@ export async function crawlController(req: Request, res: Response) {

   await saveCrawl(id, sc);

-  const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+  const sitemap = sc.crawlerOptions?.ignoreSitemap
+    ? null
+    : await crawler.tryGetSitemap();

   if (sitemap !== null) {
-    const jobs = sitemap.map(x => {
+    const jobs = sitemap.map((x) => {
       const url = x.url;
       const uuid = uuidv4();
       return {
@@ -144,26 +157,35 @@ export async function crawlController(req: Request, res: Response) {
         opts: {
           jobId: uuid,
           priority: 20,
-        }
+        },
       };
-    })
+    });

-    await lockURLs(id, jobs.map(x => x.data.url));
-    await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+    await lockURLs(
+      id,
+      jobs.map((x) => x.data.url)
+    );
+    await addCrawlJobs(
+      id,
+      jobs.map((x) => x.opts.jobId)
+    );
     await getScrapeQueue().addBulk(jobs);
   } else {
     await lockURL(id, sc, url);
-    const job = await addScrapeJob({
-      url,
-      mode: "single_urls",
-      crawlerOptions: crawlerOptions,
-      team_id: team_id,
-      pageOptions: pageOptions,
-      origin: req.body.origin ?? defaultOrigin,
-      crawl_id: id,
-    }, {
-      priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
-    });
+    const job = await addScrapeJob(
+      {
+        url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: team_id,
+        pageOptions: pageOptions,
+        origin: req.body.origin ?? defaultOrigin,
+        crawl_id: id,
+      },
+      {
+        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
+      }
+    );
     await addCrawlJob(id, job.id);
   }
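For context on the two enqueue paths reformatted above: when a sitemap is found, every sitemap URL becomes its own job and the batch is pushed with getScrapeQueue().addBulk() at priority 20; without a sitemap, only the initial URL is enqueued via addScrapeJob() at priority 15, the same priority as standalone scrape jobs. A minimal sketch of that job construction, assuming a BullMQ-style { data, opts } job shape; CrawlJob, buildSitemapJobs, and buildFirstRequestJob are illustrative names, not from the repo, and the real job data carries more fields (crawlerOptions, pageOptions, team_id, origin):

    import { v4 as uuidv4 } from "uuid";

    // Illustrative stand-in for the payload handed to getScrapeQueue().addBulk()
    // and addScrapeJob(); the real job data carries more fields.
    interface CrawlJob {
      data: { url: string; mode: string; crawl_id: string };
      opts: { jobId: string; priority: number };
    }

    // Sitemap path: one job per sitemap entry, enqueued in bulk at priority 20.
    function buildSitemapJobs(crawlId: string, sitemap: { url: string }[]): CrawlJob[] {
      return sitemap.map((x) => ({
        data: { url: x.url, mode: "single_urls", crawl_id: crawlId },
        opts: { jobId: uuidv4(), priority: 20 },
      }));
    }

    // No-sitemap path: only the initial request is enqueued, at priority 15,
    // the same priority given to standalone scrape jobs.
    function buildFirstRequestJob(crawlId: string, url: string): CrawlJob {
      return {
        data: { url, mode: "single_urls", crawl_id: crawlId },
        opts: { jobId: uuidv4(), priority: 15 },
      };
    }

In the bulk path the jobId is generated up front with uuidv4() so the same identifier can be recorded via addCrawlJobs(); the single-URL path reads it back from the job returned by addScrapeJob().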