feat(crawl): URL deduplication

This commit is contained in:
Gergő Móricz 2024-11-08 16:22:06 +01:00
parent 25e94ffd28
commit 6ecf24b85e
4 changed files with 50 additions and 7 deletions

View File

@ -143,6 +143,7 @@ export const scrapeOptions = z.object({
}).optional(),
skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true),
deduplicateSimilarURLs: z.boolean().default(true),
}).strict(strictMessage)

View File

@ -90,6 +90,13 @@ export async function getThrottledJobs(teamId: string): Promise<string[]> {
return await redisConnection.zrangebyscore("concurrency-limiter:" + teamId + ":throttled", Date.now(), Infinity);
}
/**
 * Canonicalizes a URL for visited-set comparisons: the query string and
 * fragment are dropped so that URLs differing only in those components
 * collapse to the same key. Throws a TypeError (from the URL constructor)
 * if the input is not a parseable absolute URL.
 */
export function normalizeURL(url: string): string {
  const parsed = new URL(url);
  parsed.hash = "";
  parsed.search = "";
  return parsed.href;
}
// Attempts to "lock" (mark as visited) a URL for crawl `id` in the Redis set
// crawl:<id>:visited. Returns true only if the URL was not previously seen.
// NOTE(review): this span is a rendered diff hunk — removed pre-commit lines
// are interleaved with the new implementation below; see inline notes.
export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise<boolean> {
// Enforce the crawl's page limit: once the visited set reaches the limit,
// refuse to lock further URLs. (The early-return branch is elided by the
// diff hunk header on the next visible line.)
if (typeof sc.crawlerOptions?.limit === "number") {
if (await redisConnection.scard("crawl:" + id + ":visited") >= sc.crawlerOptions.limit) {
@ -97,16 +104,42 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
}
}
try {
// Strip query string and fragment so near-identical URLs share one key.
url = normalizeURL(url);
let res: boolean;
if (!sc.scrapeOptions.deduplicateSimilarURLs) {
// Dedupe disabled: lock succeeds iff this exact normalized URL is new.
res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
} else {
// NOTE(review): the next six lines (inline normalization + catch/warn)
// appear to be the REMOVED pre-commit code interleaved by the diff view,
// superseded by normalizeURL() above — verify against the repo.
const urlO = new URL(url);
urlO.search = "";
urlO.hash = "";
url = urlO.href;
} catch (error) {
logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
// Construct two versions, one with www., one without
const urlWithWWW = new URL(urlO);
const urlWithoutWWW = new URL(urlO);
if (urlO.hostname.startsWith("www.")) {
urlWithoutWWW.hostname = urlWithWWW.hostname.slice(4);
} else {
urlWithWWW.hostname = "www." + urlWithoutWWW.hostname;
}
let permutations = [urlWithWWW, urlWithoutWWW];
// Construct more versions for http/https
permutations = permutations.flatMap(urlO => {
// Non-HTTP(S) schemes (e.g. ftp:) get no protocol permutations.
if (!["http:", "https:"].includes(urlO.protocol)) {
return [urlO];
}
const urlWithHTTP = new URL(urlO);
const urlWithHTTPS = new URL(urlO);
urlWithHTTP.protocol = "http:";
urlWithHTTPS.protocol = "https:";
return [urlWithHTTP, urlWithHTTPS];
});
// SADD returns the count of members newly added; requiring it to equal
// permutations.length means the lock fails if ANY variant was seen before.
res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
}
// NOTE(review): the next line looks like the REMOVED pre-commit sadd call
// (it would redeclare `res`) left in place by the diff rendering — confirm.
const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
// Refresh the visited set's 24h TTL only if none is set yet ("NX").
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
return res;
}

View File

@ -23,6 +23,7 @@ import {
getCrawl,
getCrawlJobs,
lockURL,
normalizeURL,
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
@ -305,6 +306,11 @@ async function processJob(job: Job & { id: string }, token: string) {
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url) !== normalizeURL(doc.metadata.sourceURL)) {
logger.debug("Was redirected, locking new URL...");
await lockURL(job.data.crawl_id, sc, doc.metadata.url);
}
await logJob({
job_id: job.id as string,

View File

@ -86,6 +86,9 @@ export interface CrawlScrapeOptions {
country?: string;
languages?: string[];
};
skipTlsVerification?: boolean;
removeBase64Images?: boolean;
deduplicateSimilarURLs?: boolean;
}
export type Action = {