fix(scrapeURL, logger): remove buggy ArrayTransport that causes memory leak

This commit is contained in:
Móricz Gergő 2024-11-11 10:27:55 +01:00
parent 84ad45c01f
commit 49df553768
3 changed files with 19 additions and 53 deletions

View File

@ -1,7 +1,6 @@
import * as winston from "winston";
import { configDotenv } from "dotenv";
import Transport from "winston-transport";
configDotenv();
const logFormat = winston.format.printf(info =>
@ -50,33 +49,3 @@ export const logger = winston.createLogger({
}),
],
});
// Options for ArrayTransport: standard winston transport options plus the
// caller-owned array to capture records into, and an optional scrapeId filter.
export type ArrayTransportOptions = Transport.TransportStreamOptions & {
// Destination array; captured log records are pushed here.
array: any[];
// When set, only records whose info.scrapeId matches are captured.
scrapeId?: string;
};
// Custom winston transport that appends every matching log record to a
// caller-supplied array. Per the commit message, this is the code being
// removed: the transport was added to a long-lived logger per scrape and
// retained a reference to the array (and every pushed record), causing the
// memory leak this commit fixes.
export class ArrayTransport extends Transport {
private array: any[];
private scrapeId?: string;
constructor(opts: ArrayTransportOptions) {
super(opts);
this.array = opts.array;
this.scrapeId = opts.scrapeId;
}
// winston transport hook (note: params are untyped — implicit any).
log(info, next) {
// Emit "logged" asynchronously, per the winston custom-transport convention.
setImmediate(() => {
this.emit("logged", info);
});
// When a scrapeId filter is configured, drop records tagged with a
// different scrapeId (records with no scrapeId are also dropped here).
if (this.scrapeId !== undefined && info.scrapeId !== this.scrapeId) {
return next();
}
this.array.push(info);
next();
}
}

View File

@ -2,7 +2,7 @@ import { Logger } from "winston";
import * as Sentry from "@sentry/node";
import { Document, ScrapeOptions } from "../../controllers/v1/types";
import { logger, ArrayTransport } from "../../lib/logger";
import { logger } from "../../lib/logger";
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { AddFeatureError, EngineError, NoEnginesLeftError, TimeoutError } from "./error";
@ -97,9 +97,6 @@ function buildMetaObject(id: string, url: string, options: ScrapeOptions, intern
const _logger = logger.child({ module: "ScrapeURL", scrapeId: id });
const logs: any[] = [];
if (process.env.ENV !== "test") {
_logger.add(new ArrayTransport({ array: logs, scrapeId: id }));
}
return {
id, url, options, internalOptions,

View File

@ -29,7 +29,7 @@ describe("Standalone scrapeURL tests", () => {
it("Basic scrape", async () => {
const out = await scrapeURL("test:scrape-basic", "https://www.roastmywebsite.ai/", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -72,7 +72,7 @@ describe("Standalone scrapeURL tests", () => {
formats: ["markdown", "html"],
}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -92,7 +92,7 @@ describe("Standalone scrapeURL tests", () => {
onlyMainContent: false,
}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -110,7 +110,7 @@ describe("Standalone scrapeURL tests", () => {
excludeTags: ['.nav', '#footer', 'strong'],
}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -125,7 +125,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 400 status code", async () => {
const out = await scrapeURL("test:scrape-400", "https://httpstat.us/400", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -138,7 +138,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 401 status code", async () => {
const out = await scrapeURL("test:scrape-401", "https://httpstat.us/401", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -151,7 +151,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 403 status code", async () => {
const out = await scrapeURL("test:scrape-403", "https://httpstat.us/403", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -164,7 +164,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 404 status code", async () => {
const out = await scrapeURL("test:scrape-404", "https://httpstat.us/404", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -177,7 +177,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 405 status code", async () => {
const out = await scrapeURL("test:scrape-405", "https://httpstat.us/405", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -190,7 +190,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a page with 500 status code", async () => {
const out = await scrapeURL("test:scrape-500", "https://httpstat.us/500", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -203,7 +203,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape a redirected page", async () => {
const out = await scrapeURL("test:scrape-redirect", "https://scrapethissite.com/", scrapeOptions.parse({}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -224,7 +224,7 @@ describe("Standalone scrapeURL tests", () => {
formats: ["screenshot"],
}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -243,7 +243,7 @@ describe("Standalone scrapeURL tests", () => {
formats: ["screenshot@fullPage"],
}), { forceEngine });
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -261,7 +261,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape of a PDF file", async () => {
const out = await scrapeURL("test:scrape-pdf", "https://arxiv.org/pdf/astro-ph/9301001.pdf", scrapeOptions.parse({}));
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -275,7 +275,7 @@ describe("Standalone scrapeURL tests", () => {
it("Scrape a DOCX file", async () => {
const out = await scrapeURL("test:scrape-docx", "https://nvca.org/wp-content/uploads/2019/06/NVCA-Model-Document-Stock-Purchase-Agreement.docx", scrapeOptions.parse({}));
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -304,7 +304,7 @@ describe("Standalone scrapeURL tests", () => {
},
}));
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -335,7 +335,7 @@ describe("Standalone scrapeURL tests", () => {
},
}));
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
expect(out.success).toBe(true);
if (out.success) {
expect(out.document.warning).toBeUndefined();
@ -369,7 +369,7 @@ describe("Standalone scrapeURL tests", () => {
}
// verify that log collection works properly while concurrency is happening
expect(out.logs.length).toBeGreaterThan(0);
// expect(out.logs.length).toBeGreaterThan(0);
const weirdLogs = out.logs.filter(x => x.scrapeId !== id);
if (weirdLogs.length > 0) {
console.warn(JSON.stringify(weirdLogs, replacer));