diff --git a/apps/api/package.json b/apps/api/package.json index 8ae16099..07e3b7a5 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -11,8 +11,8 @@ "start:dev": "nodemon --exec ts-node src/index.ts", "build": "tsc", "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", - "test:local-no-auth":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", - "test:prod":"npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", + "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", + "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", "workers": "nodemon --exec ts-node src/services/queue-worker.ts", "worker:production": "node dist/src/services/queue-worker.js", "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", @@ -66,6 +66,7 @@ "glob": "^10.3.12", "gpt3-tokenizer": "^1.1.5", "ioredis": "^5.3.2", + "joplin-turndown-plugin-gfm": "^1.0.12", "keyword-extractor": "^0.0.25", "langchain": "^0.1.25", "languagedetect": "^2.0.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index df669d55..5298d2b7 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -80,6 +80,9 @@ dependencies: ioredis: specifier: ^5.3.2 version: 5.3.2 + joplin-turndown-plugin-gfm: + specifier: ^1.0.12 + version: 1.0.12 keyword-extractor: specifier: ^0.0.25 version: 0.0.25 @@ -3923,6 +3926,10 @@ packages: - ts-node dev: true + /joplin-turndown-plugin-gfm@1.0.12: + resolution: {integrity: sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA==} + dev: false + /js-tiktoken@1.0.10: resolution: {integrity: sha512-ZoSxbGjvGyMT13x6ACo9ebhDha/0FHdKA+OsQcMOWcm1Zs7r90Rhk5lhERLzji+3rA7EKpXCgwXcM5fF3DMpdA==} dependencies: diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 0fd8c938..e084f5ef 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,6 +1,8 @@ + export function parseMarkdown(html: string) { var TurndownService = require("turndown"); - var turndownPluginGfm = require("turndown-plugin-gfm"); + var turndownPluginGfm = require('joplin-turndown-plugin-gfm') + const turndownService = new TurndownService(); turndownService.addRule("inlineLink", { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index fbcd9238..0f3cc380 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -142,6 +142,7 @@ export async function scrapSingleUrl( break; } let cleanedHtml = removeUnwantedElements(text, pageOptions); + return [await parseMarkdown(cleanedHtml), text]; };