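// E2E checkup for the /v0/crawl endpoint: every website in data/crawl.json is crawled,
// the crawl job is polled until it completes, and the run is scored against the dataset's
// expectations (minimum page count, pages that must and must not appear in the crawl).
// Requires TEST_API_KEY to be set and the API to be running at TEST_URL.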
import request from "supertest";
import dotenv from "dotenv";
import { WebsiteScrapeError } from "../utils/types";
import { logErrors } from "../utils/log";
import websitesData from "../data/crawl.json";
import "dotenv/config";
import fs from "fs";

dotenv.config();
interface WebsiteData {
  website: string;
  expected_min_num_of_pages: number;
  expected_crawled_pages: string[];
  expected_not_crawled_pages?: string[];
}
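// Illustrative shape of a single data/crawl.json entry (values below are hypothetical,
// not taken from the real dataset):
//
// {
//   "website": "https://example.com",
//   "expected_min_num_of_pages": 5,
//   "expected_crawled_pages": ["https://example.com/about"],
//   "expected_not_crawled_pages": ["https://example.com/admin"]
// }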
const TEST_URL = "http://127.0.0.1:3002";
describe ( "Crawling Checkup (E2E)" , ( ) = > {
beforeAll ( ( ) = > {
if ( ! process . env . TEST_API_KEY ) {
throw new Error ( "TEST_API_KEY is not set" ) ;
}
} ) ;
describe ( "Crawling website tests with a dataset" , ( ) = > {
it ( "Should crawl the website and verify the response" , async ( ) = > {
let passedTests = 0 ;
const startTime = new Date ( ) . getTime ( ) ;
const date = new Date ( ) ;
const logsDir = ` logs/ ${ date . getMonth ( ) + 1 } - ${ date . getDate ( ) } - ${ date . getFullYear ( ) } ` ;
let errorLogFileName = ` ${ logsDir } /run.log_ ${ new Date ( ) . toTimeString ( ) . split ( ' ' ) [ 0 ] } ` ;
const errorLog : WebsiteScrapeError [ ] = [ ] ;
      for (const websiteData of websitesData) {
        try {
          const crawlResponse = await request(TEST_URL)
            .post("/v0/crawl")
            .set("Content-Type", "application/json")
            .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
            .send({ url: websiteData.website, pageOptions: { onlyMainContent: true }, crawlerOptions: { limit: 100, returnOnlyUrls: true } });

          const jobId = crawlResponse.body.jobId;

          let completedResponse: any;
          let isFinished = false;

          while (!isFinished) {
            completedResponse = await request(TEST_URL)
              .get(`/v0/crawl/status/${jobId}`)
              .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);

            isFinished = completedResponse.body.status === "completed";

            if (!isFinished) {
              await new Promise(resolve => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
            }
          }
          if (!completedResponse) {
            // no status response was received for this job; skip to the next website
            console.log('No response');
            continue;
          }
          if (!completedResponse.body || completedResponse.body.status !== "completed") {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: 'SUCCESS',
              actual_output: 'FAILURE',
              error: `Crawl job did not complete successfully.`
            });
            continue;
          }
          // check how many webpages were crawled successfully
          // compares with expected_min_num_of_pages
          if (completedResponse.body.data.length < websiteData.expected_min_num_of_pages) {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: `SUCCESS: ${websiteData.expected_min_num_of_pages}`,
              actual_output: `FAILURE: ${completedResponse.body.data.length}`,
              error: `Expected at least ${websiteData.expected_min_num_of_pages} webpages, but got ${completedResponse.body.data.length}`
            });
            console.log('Error: ', errorLog);
            continue;
          }
          // checks if crawled pages contain expected_crawled_pages
          if (
            websiteData.expected_crawled_pages &&
            websiteData.expected_crawled_pages.length > 0 &&
            websiteData.expected_crawled_pages.some(page =>
              !completedResponse.body.data?.some((d: { url: string }) => d.url === page)
            )
          ) {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: `SUCCESS: ${websiteData.expected_crawled_pages}`,
              actual_output: `FAILURE: ${completedResponse.body.data.map((d: { url: string }) => d.url)}`,
              error: `Expected crawled pages to contain ${websiteData.expected_crawled_pages}, but got ${completedResponse.body.data.map((d: { url: string }) => d.url)}`
            });
            console.log('Error: ', errorLog);
            continue;
          }
          // checks that crawled pages do not contain expected_not_crawled_pages
          if (
            websiteData.expected_not_crawled_pages &&
            websiteData.expected_not_crawled_pages.length > 0 &&
            completedResponse.body.data &&
            websiteData.expected_not_crawled_pages.filter(page =>
              completedResponse.body.data.some((d: { url: string }) => d.url === page)
            ).length > 0
          ) {
            errorLog.push({
              website: websiteData.website,
              prompt: 'CRAWL',
              expected_output: `SUCCESS: ${websiteData.expected_not_crawled_pages}`,
              actual_output: `FAILURE: ${completedResponse.body.data.map((d: { url: string }) => d.url)}`,
              error: `Expected crawled pages to not contain ${websiteData.expected_not_crawled_pages}, but got ${completedResponse.body.data.map((d: { url: string }) => d.url)}`
            });
            console.log('Error: ', errorLog);
            continue;
          }
          passedTests++;
        } catch (error) {
          console.error(`Error processing ${websiteData.website}: ${error}`);
          errorLog.push({
            website: websiteData.website,
            prompt: 'CRAWL',
            expected_output: 'SUCCESS',
            actual_output: 'FAILURE',
            error: `Error processing ${websiteData.website}: ${error}`
          });
          continue;
        }
      }

      const score = (passedTests / websitesData.length) * 100;
      const endTime = new Date().getTime();
      const timeTaken = (endTime - startTime) / 1000;
      console.log(`Score: ${score}%`);

      await logErrors(errorLog, timeTaken, 0, score, websitesData.length);

      if (process.env.ENV === "local" && errorLog.length > 0) {
        if (!fs.existsSync(logsDir)) {
          fs.mkdirSync(logsDir, { recursive: true });
        }
        fs.writeFileSync(errorLogFileName, JSON.stringify(errorLog, null, 2));
      }
      expect(score).toBeGreaterThanOrEqual(90);
    }, 350000); // 350 second timeout
  });
});