feat(crawl): add parameter to treat differing query parameters as different URLs (#892)

* add parameter to crawlerOptions

* add code to make it work
Author: Gergő Móricz, 2024-11-11 21:36:22 +01:00 (committed by GitHub)
Parent: 5cb46dc494
Commit: a8dc75f762
7 changed files with 19 additions and 16 deletions
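
For context, a rough usage sketch (not part of this commit) of how the new option would be set from the JS SDK, assuming FirecrawlApp.crawlUrl accepts the CrawlParams interface touched at the bottom of this diff:

    import FirecrawlApp from "@mendable/firecrawl-js";

    const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

    async function crawlWithQueryParams() {
      // ignoreQueryParameters defaults to false, so /items?page=1 and /items?page=2
      // normalize to different URLs and are both crawled; set it to true to collapse them.
      return await app.crawlUrl("https://example.com", {
        limit: 100,
        deduplicateSimilarURLs: true,
        ignoreQueryParameters: false,
      });
    }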

View File

@@ -195,6 +195,7 @@ export async function crawlController(req: Request, res: Response) {
     await lockURLs(
       id,
+      sc,
       jobs.map((x) => x.data.url)
     );
     await addCrawlJobs(

View File

@@ -76,6 +76,7 @@ export async function batchScrapeController(
     await lockURLs(
       id,
+      sc,
       jobs.map((x) => x.data.url)
     );
     await addCrawlJobs(

View File

@@ -133,6 +133,7 @@ export async function crawlController(
     await lockURLs(
       id,
+      sc,
       jobs.map((x) => x.data.url)
     );
     await addCrawlJobs(

View File

@@ -205,6 +205,7 @@ const crawlerOptions = z.object({
   allowExternalLinks: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
+  ignoreQueryParameters: z.boolean().default(false),
 }).strict(strictMessage);
 
 // export type CrawlerOptions = {
@@ -460,6 +461,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowExternalContentLinks: x.allowExternalLinks,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
+    ignoreQueryParameters: x.ignoreQueryParameters,
   };
 }
 
@@ -474,6 +476,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
       allowExternalLinks: x.allowExternalContentLinks,
       ignoreSitemap: x.ignoreSitemap,
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
+      ignoreQueryParameters: x.ignoreQueryParameters,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
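
Roughly how the new field flows through validation and the legacy conversion; a sketch under the assumption that every field in the crawlerOptions schema carries a default, so only the new flag needs to be supplied:

    // Hypothetical illustration: validate options through the zod schema above, then convert.
    const opts = crawlerOptions.parse({ ignoreQueryParameters: true });

    const legacy = toLegacyCrawlerOptions(opts);
    // legacy.ignoreQueryParameters === true; the crawler sees the flag unchanged.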

View File

@@ -90,9 +90,11 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
   return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
 }
 
-export function normalizeURL(url: string): string {
+export function normalizeURL(url: string, sc: StoredCrawl): string {
   const urlO = new URL(url);
-  urlO.search = "";
+  if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
+    urlO.search = "";
+  }
   urlO.hash = "";
   return urlO.href;
 }
@@ -130,12 +132,15 @@ export function generateURLPermutations(url: string | URL): URL[] {
 
 export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
   if (typeof sc.crawlerOptions?.limit === "number") {
-    if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
+    if (await redisConnection.scard("crawl:" + id + ":visited_unique") >= sc.crawlerOptions.limit) {
       return false;
     }
   }
 
-  url = normalizeURL(url);
+  url = normalizeURL(url, sc);
+
+  await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
+  await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
 
   let res: boolean;
   if (!sc.crawlerOptions.deduplicateSimilarURLs) {
@@ -150,18 +155,9 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
 }
 
 /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
-export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
+export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
   urls = urls.map(url => {
-    try {
-      const urlO = new URL(url);
-      urlO.search = "";
-      urlO.hash = "";
-      return urlO.href;
-    } catch (error) {
-      logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
-    }
-
-    return url;
+    return normalizeURL(url, sc);
   });
 
   const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
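
To illustrate the normalizeURL change, a hedged sketch of the expected behavior; the StoredCrawl stubs below are hypothetical and only carry crawlerOptions:

    // Minimal StoredCrawl stubs (hypothetical); only crawlerOptions matters to normalizeURL here.
    const keepQuery = { crawlerOptions: { ignoreQueryParameters: false } } as StoredCrawl;
    const dropQuery = { crawlerOptions: { ignoreQueryParameters: true } } as StoredCrawl;

    // Default (false): only the hash is dropped, so ?page=1 and ?page=2 stay distinct URLs.
    normalizeURL("https://example.com/items?page=2#top", keepQuery);
    // => "https://example.com/items?page=2"

    // true (or crawlerOptions missing): the query string is stripped, matching the old behavior.
    normalizeURL("https://example.com/items?page=2#top", dropQuery);
    // => "https://example.com/items"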

View File

@@ -320,7 +320,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     if (job.data.crawl_id) {
       const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 
-      if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
+      if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
         logger.debug("Was redirected, locking new URL...");
         await lockURL(job.data.crawl_id, sc, doc.metadata.url);
       }

View File

@@ -155,6 +155,7 @@ export interface CrawlParams {
   scrapeOptions?: CrawlScrapeOptions;
   webhook?: string;
   deduplicateSimilarURLs?: boolean;
+  ignoreQueryParameters?: boolean;
 }
 
 /**