From b2ae1a52d5e92882c739857832dfb1d38f27644b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Oct 2024 23:13:13 +0200 Subject: [PATCH 1/3] fix(Dockerfile): remove chromium --- apps/api/Dockerfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile index dce421d5..b908f679 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -28,9 +28,6 @@ RUN cd /app/src/lib/go-html-to-md && \ chmod +x html-to-markdown.so FROM base -RUN apt-get update -qq && \ - apt-get install --no-install-recommends -y chromium chromium-sandbox && \ - rm -rf /var/lib/apt/lists /var/cache/apt/archives COPY --from=prod-deps /app/node_modules /app/node_modules COPY --from=build /app /app COPY --from=go-base /app/src/lib/go-html-to-md/html-to-markdown.so /app/dist/src/lib/go-html-to-md/html-to-markdown.so From 17d0ed061ea6aa52710090b9db4da4451d15f104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 9 Oct 2024 23:13:26 +0200 Subject: [PATCH 2/3] push --- apps/api/src/index.ts | 318 ++++++++++++++++++++---------------------- 1 file changed, 149 insertions(+), 169 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index cf905207..79439038 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -7,7 +7,6 @@ import cors from "cors"; import { getScrapeQueue } from "./services/queue-service"; import { v0Router } from "./routes/v0"; import { initSDK } from "@hyperdx/node-opentelemetry"; -import cluster from "cluster"; import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; @@ -37,198 +36,179 @@ const cacheable = new CacheableLookup({ cacheable.install(http.globalAgent); cacheable.install(https.globalAgent) -if (cluster.isMaster) { - Logger.info(`Master ${process.pid} is running`); +const ws = expressWs(express()); +const app = ws.app; - // Fork workers. - for (let i = 0; i < numCPUs; i++) { - cluster.fork(); - } +global.isProduction = process.env.IS_PRODUCTION === "true"; - cluster.on("exit", (worker, code, signal) => { - if (code !== null) { - Logger.info(`Worker ${worker.process.pid} exited`); - Logger.info("Starting a new worker"); - cluster.fork(); - } +app.use(bodyParser.urlencoded({ extended: true })); +app.use(bodyParser.json({ limit: "10mb" })); + +app.use(cors()); // Add this line to enable CORS + +const serverAdapter = new ExpressAdapter(); +serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); + +const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ + queues: [new BullAdapter(getScrapeQueue())], + serverAdapter: serverAdapter, +}); + +app.use( + `/admin/${process.env.BULL_AUTH_KEY}/queues`, + serverAdapter.getRouter() +); + +app.get("/", (req, res) => { + res.send("SCRAPERS-JS: Hello, world! Fly.io"); +}); + +//write a simple test function +app.get("/test", async (req, res) => { + res.send("Hello, world!"); +}); + +// register router +app.use(v0Router); +app.use("/v1", v1Router); +app.use(adminRouter); + +const DEFAULT_PORT = process.env.PORT ?? 3002; +const HOST = process.env.HOST ?? "localhost"; + +// HyperDX OpenTelemetry +if (process.env.ENV === "production") { + initSDK({ consoleCapture: true, additionalInstrumentations: [] }); +} + +function startServer(port = DEFAULT_PORT) { + const server = app.listen(Number(port), HOST, () => { + Logger.info(`Worker ${process.pid} listening on port ${port}`); + Logger.info( + `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); }); -} else { - const ws = expressWs(express()); - const app = ws.app; + return server; +} - global.isProduction = process.env.IS_PRODUCTION === "true"; +if (require.main === module) { + startServer(); +} - app.use(bodyParser.urlencoded({ extended: true })); - app.use(bodyParser.json({ limit: "10mb" })); +app.get(`/serverHealthCheck`, async (req, res) => { + try { + const scrapeQueue = getScrapeQueue(); + const [waitingJobs] = await Promise.all([ + scrapeQueue.getWaitingCount(), + ]); - app.use(cors()); // Add this line to enable CORS - - const serverAdapter = new ExpressAdapter(); - serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); - - const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ - queues: [new BullAdapter(getScrapeQueue())], - serverAdapter: serverAdapter, - }); - - app.use( - `/admin/${process.env.BULL_AUTH_KEY}/queues`, - serverAdapter.getRouter() - ); - - app.get("/", (req, res) => { - res.send("SCRAPERS-JS: Hello, world! Fly.io"); - }); - - //write a simple test function - app.get("/test", async (req, res) => { - res.send("Hello, world!"); - }); - - // register router - app.use(v0Router); - app.use("/v1", v1Router); - app.use(adminRouter); - - const DEFAULT_PORT = process.env.PORT ?? 3002; - const HOST = process.env.HOST ?? "localhost"; - - // HyperDX OpenTelemetry - if (process.env.ENV === "production") { - initSDK({ consoleCapture: true, additionalInstrumentations: [] }); - } - - function startServer(port = DEFAULT_PORT) { - const server = app.listen(Number(port), HOST, () => { - Logger.info(`Worker ${process.pid} listening on port ${port}`); - Logger.info( - `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - ); + const noWaitingJobs = waitingJobs === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noWaitingJobs ? 200 : 500).json({ + waitingJobs, }); - return server; + } catch (error) { + Sentry.captureException(error); + Logger.error(error); + return res.status(500).json({ error: error.message }); } +}); - if (require.main === module) { - startServer(); - } +app.get("/serverHealthCheck/notify", async (req, res) => { + if (process.env.SLACK_WEBHOOK_URL) { + const treshold = 1; // The treshold value for the active jobs + const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds - app.get(`/serverHealthCheck`, async (req, res) => { - try { + const getWaitingJobsCount = async () => { const scrapeQueue = getScrapeQueue(); - const [waitingJobs] = await Promise.all([ + const [waitingJobsCount] = await Promise.all([ scrapeQueue.getWaitingCount(), ]); - const noWaitingJobs = waitingJobs === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noWaitingJobs ? 200 : 500).json({ - waitingJobs, - }); - } catch (error) { - Sentry.captureException(error); - Logger.error(error); - return res.status(500).json({ error: error.message }); - } - }); + return waitingJobsCount; + }; - app.get("/serverHealthCheck/notify", async (req, res) => { - if (process.env.SLACK_WEBHOOK_URL) { - const treshold = 1; // The treshold value for the active jobs - const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds + res.status(200).json({ message: "Check initiated" }); - const getWaitingJobsCount = async () => { - const scrapeQueue = getScrapeQueue(); - const [waitingJobsCount] = await Promise.all([ - scrapeQueue.getWaitingCount(), - ]); + const checkWaitingJobs = async () => { + try { + let waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + setTimeout(async () => { + // Re-check the waiting jobs count after the timeout + waitingJobsCount = await getWaitingJobsCount(); + if (waitingJobsCount >= treshold) { + const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; + const message = { + text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ + timeout / 60000 + } minute(s).`, + }; - return waitingJobsCount; - }; + const response = await fetch(slackWebhookUrl, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify(message), + }); - res.status(200).json({ message: "Check initiated" }); - - const checkWaitingJobs = async () => { - try { - let waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - setTimeout(async () => { - // Re-check the waiting jobs count after the timeout - waitingJobsCount = await getWaitingJobsCount(); - if (waitingJobsCount >= treshold) { - const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; - const message = { - text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${ - timeout / 60000 - } minute(s).`, - }; - - const response = await fetch(slackWebhookUrl, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify(message), - }); - - if (!response.ok) { - Logger.error("Failed to send Slack notification"); - } + if (!response.ok) { + Logger.error("Failed to send Slack notification"); } - }, timeout); - } - } catch (error) { - Sentry.captureException(error); - Logger.debug(error); + } + }, timeout); } - }; - - checkWaitingJobs(); - } - }); - - app.get("/is-production", (req, res) => { - res.send({ isProduction: global.isProduction }); - }); - - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { - if (err instanceof ZodError) { - if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) { - Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body)); - } - - res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); - } else { - next(err); - } - }); - - Sentry.setupExpressErrorHandler(app); - - app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { - if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) { - return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' }); - } - - const id = res.sentry ?? uuidv4(); - let verbose = JSON.stringify(err); - if (verbose === "{}") { - if (err instanceof Error) { - verbose = JSON.stringify({ - message: err.message, - name: err.name, - stack: err.stack, - }); + } catch (error) { + Sentry.captureException(error); + Logger.debug(error); } + }; + + checkWaitingJobs(); + } +}); + +app.get("/is-production", (req, res) => { + res.send({ isProduction: global.isProduction }); +}); + +app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response, next: NextFunction) => { + if (err instanceof ZodError) { + if (Array.isArray(err.errors) && err.errors.find(x => x.message === "URL uses unsupported protocol")) { + Logger.warn("Unsupported protocol error: " + JSON.stringify(req.body)); + } + + res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); + } else { + next(err); + } +}); + +Sentry.setupExpressErrorHandler(app); + +app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry, next: NextFunction) => { + if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) { + return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' }); + } + + const id = res.sentry ?? uuidv4(); + let verbose = JSON.stringify(err); + if (verbose === "{}") { + if (err instanceof Error) { + verbose = JSON.stringify({ + message: err.message, + name: err.name, + stack: err.stack, + }); } + } - Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); - res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); - }); - - Logger.info(`Worker ${process.pid} started`); -} - + Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose); + res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id }); +}); +Logger.info(`Worker ${process.pid} started`); // const sq = getScrapeQueue(); From 30a375d693e25d5fe62c1a9ff8586f790ba81949 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 9 Oct 2024 19:26:25 -0300 Subject: [PATCH 3/3] Nick: --- .github/workflows/fly-direct.yml | 46 ------------------ .github/workflows/fly.yml | 81 -------------------------------- 2 files changed, 127 deletions(-) delete mode 100644 .github/workflows/fly-direct.yml delete mode 100644 .github/workflows/fly.yml diff --git a/.github/workflows/fly-direct.yml b/.github/workflows/fly-direct.yml deleted file mode 100644 index 3ee460e6..00000000 --- a/.github/workflows/fly-direct.yml +++ /dev/null @@ -1,46 +0,0 @@ -name: Fly Deploy Direct -on: - schedule: - - cron: '0 * * * *' - -env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - HOST: ${{ secrets.HOST }} - LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} - LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} - POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} - POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} - NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} - PORT: ${{ secrets.PORT }} - REDIS_URL: ${{ secrets.REDIS_URL }} - SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} - SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} - SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - TEST_API_KEY: ${{ secrets.TEST_API_KEY }} - PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} - PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - NPM_TOKEN: ${{ secrets.NPM_TOKEN }} - CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} - SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} - USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} - ENV: ${{ secrets.ENV }} - -jobs: - deploy: - name: Deploy app - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@v3 - - uses: superfly/flyctl-actions/setup-flyctl@master - - run: flyctl deploy --remote-only -a firecrawl-scraper-js --build-secret SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN - working-directory: ./apps/api - env: - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml deleted file mode 100644 index ed8dade1..00000000 --- a/.github/workflows/fly.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: Fly Deploy -on: - push: - branches: - - main - -env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - HOST: ${{ secrets.HOST }} - LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }} - LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }} - POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} - POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} - NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} - PORT: ${{ secrets.PORT }} - REDIS_URL: ${{ secrets.REDIS_URL }} - SCRAPING_BEE_API_KEY: ${{ secrets.SCRAPING_BEE_API_KEY }} - SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }} - SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - TEST_API_KEY: ${{ secrets.TEST_API_KEY }} - PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} - PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - NPM_TOKEN: ${{ secrets.NPM_TOKEN }} - CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} - SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} - USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} - ENV: ${{ secrets.ENV }} - -jobs: - pre-deploy: - name: Pre-deploy checks - runs-on: ubuntu-latest - services: - redis: - image: redis - ports: - - 6379:6379 - steps: - - uses: actions/checkout@v3 - - name: Set up Node.js - uses: actions/setup-node@v3 - with: - node-version: "20" - - name: Install pnpm - run: npm install -g pnpm - - name: Install dependencies - run: pnpm install - working-directory: ./apps/api - - name: Start the application - run: npm start & - working-directory: ./apps/api - id: start_app - - name: Start workers - run: npm run workers & - working-directory: ./apps/api - id: start_workers - - name: Run E2E tests - run: | - npm run test:prod - working-directory: ./apps/api - - deploy: - name: Deploy app - needs: pre-deploy - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: superfly/flyctl-actions/setup-flyctl@master - - run: flyctl deploy --remote-only -a firecrawl-scraper-js --build-secret SENTRY_AUTH_TOKEN=$SENTRY_AUTH_TOKEN - working-directory: ./apps/api - env: - FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} - - \ No newline at end of file