Update crawl.ts

Nicolas 2024-08-19 11:02:24 -03:00
parent 36b35dbc67
commit 8e4ca86463


@@ -7,10 +7,22 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
-import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+import {
+  defaultCrawlPageOptions,
+  defaultCrawlerOptions,
+  defaultOrigin,
+} from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../src/lib/validateUrl";
@@ -38,10 +50,12 @@ export async function crawlController(req: Request, res: Response) {
       }
     }
-    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
+    const crawlerOptions = {
+      ...defaultCrawlerOptions,
+      ...req.body.crawlerOptions,
+    };
     const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
     const limitCheck = crawlerOptions?.limit ?? 1;
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
       await checkTeamCredits(team_id, limitCheck);
@@ -63,17 +77,14 @@ export async function crawlController(req: Request, res: Response) {
     }
     if (isUrlBlocked(url)) {
-      return res
-        .status(403)
-        .json({
-          error:
-            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
-        });
+      return res.status(403).json({
+        error:
+          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
+      });
     }
     const mode = req.body.mode ?? "crawl";
     // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
     //   try {
     //     const a = new WebScraperDataProvider();
@@ -123,10 +134,12 @@ export async function crawlController(req: Request, res: Response) {
     await saveCrawl(id, sc);
-    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+    const sitemap = sc.crawlerOptions?.ignoreSitemap
+      ? null
+      : await crawler.tryGetSitemap();
     if (sitemap !== null) {
-      const jobs = sitemap.map(x => {
+      const jobs = sitemap.map((x) => {
         const url = x.url;
         const uuid = uuidv4();
         return {
@@ -144,26 +157,35 @@ export async function crawlController(req: Request, res: Response) {
           opts: {
             jobId: uuid,
             priority: 20,
-          }
+          },
         };
-      })
+      });
-      await lockURLs(id, jobs.map(x => x.data.url));
-      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await lockURLs(
+        id,
+        jobs.map((x) => x.data.url)
+      );
+      await addCrawlJobs(
+        id,
+        jobs.map((x) => x.opts.jobId)
+      );
       await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
-      const job = await addScrapeJob({
-        url,
-        mode: "single_urls",
-        crawlerOptions: crawlerOptions,
-        team_id: team_id,
-        pageOptions: pageOptions,
-        origin: req.body.origin ?? defaultOrigin,
-        crawl_id: id,
-      }, {
-        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
-      });
+      const job = await addScrapeJob(
+        {
+          url,
+          mode: "single_urls",
+          crawlerOptions: crawlerOptions,
+          team_id: team_id,
+          pageOptions: pageOptions,
+          origin: req.body.origin ?? defaultOrigin,
+          crawl_id: id,
+        },
+        {
+          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
+        }
+      );
       await addCrawlJob(id, job.id);
     }
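
Note on the queueing this diff touches: sitemap-derived jobs are enqueued in bulk via getScrapeQueue().addBulk(jobs) with priority 20, while the crawl's initial request goes through addScrapeJob with priority 15. Assuming the scrape queue is BullMQ-backed (which the addBulk/jobId/priority option shape suggests), a lower priority number is served first, which is what the "prioritize request 0 of crawl jobs" comment relies on. Below is a minimal sketch of what such a helper could look like; the function body and the ScrapeJobData interface are illustrative assumptions, not code from this commit.

// Sketch of a BullMQ-backed addScrapeJob; the body and ScrapeJobData shape
// are assumptions inferred from the call sites above, not code from this commit.
import { Queue, JobsOptions } from "bullmq";
import { v4 as uuidv4 } from "uuid";
import { getScrapeQueue } from "../../src/services/queue-service";

// Assumed payload shape, mirroring the object literal passed in the controller.
interface ScrapeJobData {
  url: string;
  mode: string;
  crawlerOptions: any;
  team_id: string;
  pageOptions: any;
  origin: string;
  crawl_id?: string;
  sitemapped?: boolean;
}

export async function addScrapeJob(
  data: ScrapeJobData,
  opts: JobsOptions = {}
) {
  const queue: Queue = getScrapeQueue();
  // BullMQ treats a lower priority number as more urgent, so the crawl's
  // first request (priority 15) is picked up before the sitemap jobs
  // added in bulk with priority 20.
  return queue.add(uuidv4(), data, {
    priority: 10,
    jobId: uuidv4(),
    ...opts,
  });
}

With a helper of this shape, the controller's addScrapeJob({ url, mode: "single_urls", ... }, { priority: 15 }) call maps onto a single queue.add with an explicit jobId, mirroring the per-job opts the sitemap branch builds before addBulk.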