fix crawl option conversion
Móricz Gergő 2024-11-05 12:28:44 +01:00
parent 2a96717f67
commit cd534326ba
4 changed files with 24 additions and 9 deletions
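With this change, StoredCrawl.crawlerOptions holds the legacy (v0-shaped) options everywhere: the v0 controllers store the incoming legacy options unchanged, the v1 controller converts its parsed options with the new toLegacyCrawlerOptions before saving, and fromLegacyCombo no longer returns a converted crawlOptions value. Previously, the v0 controllers stored the whole { crawlOptions, internalOptions } wrapper that fromLegacyCrawlerOptions returns under crawlerOptions, rather than the options themselves.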


@@ -15,7 +15,7 @@ import { getScrapeQueue } from "../../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
-import { fromLegacyCrawlerOptions, fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
+import { fromLegacyScrapeOptions, url as urlSchema } from "../v1/types";
 import { ZodError } from "zod";
 
 export async function crawlController(req: Request, res: Response) {
@@ -140,7 +140,7 @@ export async function crawlController(req: Request, res: Response) {
     const sc: StoredCrawl = {
       originUrl: url,
-      crawlerOptions: fromLegacyCrawlerOptions(crawlerOptions),
+      crawlerOptions,
       scrapeOptions,
       internalOptions,
       team_id,
@@ -177,7 +177,7 @@ export async function crawlController(req: Request, res: Response) {
       data: {
         url,
         mode: "single_urls",
-        crawlerOptions: crawlerOptions,
+        crawlerOptions,
         team_id,
         plan,
         pageOptions: pageOptions,


@@ -8,7 +8,7 @@ import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "..
 import { addScrapeJob } from "../../../src/services/queue-jobs";
 import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import * as Sentry from "@sentry/node";
-import { fromLegacyCrawlerOptions, fromLegacyScrapeOptions } from "../v1/types";
+import { fromLegacyScrapeOptions } from "../v1/types";
 
 export async function crawlPreviewController(req: Request, res: Response) {
   try {
@@ -91,7 +91,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
     const sc: StoredCrawl = {
       originUrl: url,
-      crawlerOptions: fromLegacyCrawlerOptions(crawlerOptions),
+      crawlerOptions,
       scrapeOptions,
       internalOptions,
       team_id,


@@ -5,6 +5,7 @@ import {
   crawlRequestSchema,
   CrawlResponse,
   RequestWithAuth,
+  toLegacyCrawlerOptions,
 } from "./types";
 import {
   addCrawlJob,
@@ -70,7 +71,7 @@ export async function crawlController(
     const sc: StoredCrawl = {
       originUrl: req.body.url,
-      crawlerOptions,
+      crawlerOptions: toLegacyCrawlerOptions(crawlerOptions),
       scrapeOptions,
       internalOptions: {},
       team_id: req.auth.team_id,
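Note that the conversion direction flips here relative to the v0 controllers above: the parsed v1 options are mapped down to the legacy shape (via toLegacyCrawlerOptions, added below) before the crawl is persisted, so both API versions now store the same option format.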


@@ -440,6 +440,20 @@ export interface ResponseWithSentry<
   sentry?: string,
 }
 
+export function toLegacyCrawlerOptions(x: CrawlerOptions) {
+  return {
+    includes: x.includePaths,
+    excludes: x.excludePaths,
+    maxCrawledLinks: x.limit,
+    maxDepth: x.maxDepth,
+    limit: x.limit,
+    generateImgAltText: false,
+    allowBackwardCrawling: x.allowBackwardLinks,
+    allowExternalContentLinks: x.allowExternalLinks,
+    ignoreSitemap: x.ignoreSitemap,
+  };
+}
+
 export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions; internalOptions: InternalOptions } {
   return {
     crawlOptions: crawlerOptions.parse({
@@ -493,10 +507,10 @@ export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptio
   }
 }
 
-export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, crawlOptions: CrawlerOptions, internalOptions: InternalOptions} {
+export function fromLegacyCombo(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined, crawlerOptions: any): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions} {
   const { scrapeOptions, internalOptions: i1 } = fromLegacyScrapeOptions(pageOptions, extractorOptions, timeout);
-  const { crawlOptions, internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
-  return { scrapeOptions, crawlOptions, internalOptions: Object.assign(i1, i2) };
+  const { internalOptions: i2 } = fromLegacyCrawlerOptions(crawlerOptions);
+  return { scrapeOptions, internalOptions: Object.assign(i1, i2) };
 }
 
 export function toLegacyDocument(document: Document, internalOptions: InternalOptions): V0Document | { url: string; } {
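A minimal sketch of the new mapping, assuming hypothetical option values (the field names follow toLegacyCrawlerOptions above; the v1Options name and sample values are illustrative only):

// Hypothetical v1 options; values chosen only to show the mapping.
const v1Options = {
  includePaths: ["/blog/*"],
  excludePaths: ["/admin/*"],
  limit: 100,
  maxDepth: 3,
  allowBackwardLinks: false,
  allowExternalLinks: false,
  ignoreSitemap: true,
};

// toLegacyCrawlerOptions(v1Options) returns:
// {
//   includes: ["/blog/*"],
//   excludes: ["/admin/*"],
//   maxCrawledLinks: 100,             // taken from `limit`
//   maxDepth: 3,
//   limit: 100,
//   generateImgAltText: false,        // hardcoded by the conversion
//   allowBackwardCrawling: false,
//   allowExternalContentLinks: false,
//   ignoreSitemap: true,
// }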