Fix for maxDepth

This commit is contained in:
Eric Ciarla 2024-06-14 19:40:37 -04:00
parent 354712a8a3
commit a6b7197737
3 changed files with 76 additions and 5 deletions

View File

@ -619,13 +619,14 @@ describe("E2E Tests for API Routes", () => {
}, 180000); }, 180000);
it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => { it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
const crawlResponse = await request(TEST_URL) const crawlResponse = await request(TEST_URL)
.post("/v0/crawl") .post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json") .set("Content-Type", "application/json")
.send({ .send({
url: "https://www.scrapethissite.com", url: "https://www.mendable.ai",
crawlerOptions: { maxDepth: 0 }, crawlerOptions: { maxDepth: 2 },
}); });
expect(crawlResponse.statusCode).toBe(200); expect(crawlResponse.statusCode).toBe(200);
@ -651,6 +652,70 @@ describe("E2E Tests for API Routes", () => {
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`) .get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`); .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const testurls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
console.log(testurls)
expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed");
expect(completedResponse.body).toHaveProperty("data");
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
const urls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
expect(urls.length).toBeGreaterThanOrEqual(1);
// Check that every crawled URL has a computed path depth of at most 1 (with the formula below, only the root path "/" qualifies)
urls.forEach((url: string) => {
const pathSplits = new URL(url).pathname.split('/');
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0);
expect(depth).toBeLessThanOrEqual(1);
});
}, 180000);
it.concurrent("should return a successful response with relative max depth option for a valid crawl job with maxDepths equals to zero", async () => {
const crawlResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://www.scrapethissite.com",
crawlerOptions: { maxDepth: 2 },
});
expect(crawlResponse.statusCode).toBe(200);
const response = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("status");
expect(["active", "waiting"]).toContain(response.body.status);
// Poll the crawl status once per second until the job reports "completed"
let isCompleted = false;
while (!isCompleted) {
const statusCheckResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
expect(statusCheckResponse.statusCode).toBe(200);
isCompleted = statusCheckResponse.body.status === "completed";
if (!isCompleted) {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
const completedResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
const testurls = completedResponse.body.data.map(
(item: any) => item.metadata?.sourceURL
);
console.log(testurls)
expect(completedResponse.statusCode).toBe(200); expect(completedResponse.statusCode).toBe(200);
expect(completedResponse.body).toHaveProperty("status"); expect(completedResponse.body).toHaveProperty("status");
expect(completedResponse.body.status).toBe("completed"); expect(completedResponse.body.status).toBe("completed");

View File

@ -60,8 +60,11 @@ export class WebCrawler {
.filter((link) => { .filter((link) => {
const url = new URL(link); const url = new URL(link);
const path = url.pathname; const path = url.pathname;
const depth = url.pathname.split('/').length - 1;
const pathSplits = new URL(url).pathname.split('/');
const depth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
// Check if the link exceeds the maximum depth allowed // Check if the link exceeds the maximum depth allowed
if (depth > maxDepth) { if (depth > maxDepth) {
return false; return false;
@ -136,8 +139,10 @@ export class WebCrawler {
if(!crawlerOptions?.ignoreSitemap){ if(!crawlerOptions?.ignoreSitemap){
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (sitemapLinks.length > 0) { if (sitemapLinks.length > 0) {
let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
return filteredLinks.map(link => ({ url: link, html: "" })); return filteredLinks.map(link => ({ url: link, html: "" }));
} }
} }
@ -148,6 +153,7 @@ export class WebCrawler {
concurrencyLimit, concurrencyLimit,
inProgress inProgress
); );
if ( if (
urls.length === 0 && urls.length === 0 &&

View File

@ -164,9 +164,9 @@ export class WebScraperDataProvider {
): Promise<Document[]> { ): Promise<Document[]> {
const pathSplits = new URL(this.urls[0]).pathname.split('/'); const pathSplits = new URL(this.urls[0]).pathname.split('/');
const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0); const baseURLDepth = pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) -1;
const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth; const adjustedMaxDepth = this.maxCrawledDepth + baseURLDepth;
const crawler = new WebCrawler({ const crawler = new WebCrawler({
initialUrl: this.urls[0], initialUrl: this.urls[0],
includes: this.includes, includes: this.includes,