From 9c539e911372e814819fdf4d6623ddc97af5f64b Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:26:54 -0300 Subject: [PATCH] Fixed includeHTML to use cleanedHtml as response --- apps/api/package.json | 1 + apps/api/src/scraper/WebScraper/single_url.ts | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 407f4c55..0668f663 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -12,6 +12,7 @@ "build": "tsc", "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", + "test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'", "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'", "workers": "nodemon --exec ts-node src/services/queue-worker.ts", "worker:production": "node dist/src/services/queue-worker.js", diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index e112cd45..3c7222c4 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -401,7 +401,7 @@ export async function scrapSingleUrl( return { text: await parseMarkdown(cleanedHtml), - html: scraperResponse.text, + html: cleanedHtml, screenshot: scraperResponse.screenshot, pageStatusCode: scraperResponse.metadata.pageStatusCode, pageError: scraperResponse.metadata.pageError || undefined @@ -428,7 +428,7 @@ export async function scrapSingleUrl( if (existingHtml && existingHtml.trim().length >= 100) { let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); text = await parseMarkdown(cleanedHtml); - html = existingHtml; + html = cleanedHtml; break; }