diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 69a8a243..0b24d070 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,6 +13,8 @@ env:
   HOST: ${{ secrets.HOST }}
   LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
   LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
+  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
+  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
   NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
   OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
   PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
@@ -38,7 +40,7 @@ jobs:
       - name: Set up Node.js
         uses: actions/setup-node@v3
         with:
-          node-version: '20'
+          node-version: "20"
       - name: Install pnpm
         run: npm install -g pnpm
       - name: Install dependencies
@@ -55,4 +57,4 @@ jobs:
       - name: Run E2E tests
         run: |
           npm run test:prod
-        working-directory: ./apps/api
\ No newline at end of file
+        working-directory: ./apps/api
diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml
index ddeee55d..5e48017b 100644
--- a/.github/workflows/fly.yml
+++ b/.github/workflows/fly.yml
@@ -13,6 +13,8 @@ env:
   HOST: ${{ secrets.HOST }}
   LLAMAPARSE_API_KEY: ${{ secrets.LLAMAPARSE_API_KEY }}
   LOGTAIL_KEY: ${{ secrets.LOGTAIL_KEY }}
+  POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
+  POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }}
   NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }}
   OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
   PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }}
@@ -38,7 +40,7 @@ jobs:
      - name: Set up Node.js
        uses: actions/setup-node@v3
        with:
-          node-version: '20'
+          node-version: "20"
      - name: Install pnpm
        run: npm install -g pnpm
      - name: Install dependencies
@@ -68,4 +70,3 @@ jobs:
      - run: flyctl deploy ./apps/api --remote-only -a firecrawl-scraper-js
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
-
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 733c7877..87d8c286 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,26 +1,26 @@
-# Contributors guide:
+# Contributors guide:
 
-Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project running locally, so you can run it on your own (and contribute).
+Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project running locally, so you can run it on your own (and contribute).
 
 If you're contributing, note that the process is similar to other open source repos (fork firecrawl, make changes, run tests, open a PR). If you have any questions, or would like help getting on board, reach out to hello@mendable.ai or submit an issue!
 
-
 ## Running the project locally
 
 First, start by installing dependencies
+
 1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs)
 2. pnpm [instructions](https://pnpm.io/installation)
-3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/)
+3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/)
 
-
-Set environment variables in a .env in the /apps/api/ directory; you can copy over the template in .env.example.
+Set environment variables in a .env in the /apps/api/ directory; you can copy over the template in .env.example.
 
 To start, we won't set up authentication or any optional sub services (pdf parsing, JS blocking support, AI features).
 
 .env:
+
 ```
 # ===== Required ENVS ======
-NUM_WORKERS_PER_QUEUE=8
+NUM_WORKERS_PER_QUEUE=8
 PORT=3002
 HOST=0.0.0.0
 REDIS_URL=redis://localhost:6379
@@ -31,8 +31,8 @@ USE_DB_AUTHENTICATION=false
 
 # ===== Optional ENVS ======
 
 # Supabase Setup (used to support DB authentication, advanced logging, etc.)
-SUPABASE_ANON_TOKEN=
-SUPABASE_URL=
+SUPABASE_ANON_TOKEN=
+SUPABASE_URL=
 SUPABASE_SERVICE_TOKEN=
 
 # Other Optionals
@@ -43,6 +43,11 @@ BULL_AUTH_KEY= #
 LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
 PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
+SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
+SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
+POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
+POSTHOG_HOST= # set if you'd like to send posthog events like job logs
+
 ```
@@ -56,7 +61,7 @@ pnpm install
 
 ### Running the project
 
-You're going to need to open 3 terminals.
+You're going to need to open 3 terminals.
 
 ### Terminal 1 - setting up redis
 
@@ -69,6 +74,7 @@ redis-server
 ```
 
 ### Terminal 2 - setting up workers
 
 Now, navigate to the apps/api/ directory and run:
+
 ```bash
 pnpm run workers
 ```
@@ -77,7 +83,6 @@ This will start the workers who are responsible for processing crawl jobs.
 
 ### Terminal 3 - setting up the main server
 
-
 To do this, navigate to the apps/api/ directory. If you don’t have pnpm already, install it here: https://pnpm.io/installation
 
 Next, run your server with:
@@ -91,11 +96,11 @@ Alright: now let’s send our first request.
 
 ```curl
 curl -X GET http://localhost:3002/test
-```
+```
+
 This should return the response Hello, world!
-
-If you’d like to test the crawl endpoint, you can run this
+If you’d like to test the crawl endpoint, you can run this
 
 ```curl
 curl -X POST http://localhost:3002/v0/crawl \
@@ -103,12 +108,10 @@ curl -X POST http://localhost:3002/v0/crawl \
   -d '{
     "url": "https://mendable.ai"
   }'
-```
-
+```
+
 ## Tests:
 
 The best way to run the tests without authentication is with `npm run test:local-no-auth`.
 If you'd like to run the tests with authentication, run `npm run test:prod`.
-
-
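Note that the two new PostHog variables in the template above are optional like the rest of that block: if `POSTHOG_API_KEY` is left blank, the API logs a warning and falls back to a no-op client (see the new `apps/api/src/services/posthog.ts` further down in this diff), so job events are simply not sent. For contributors who want to double-check which optional integrations their local `.env` actually enables, a throwaway script along these lines can help; it is purely illustrative (the file name and the ts-node invocation are assumptions, not part of this PR):

```typescript
// check-optional-env.ts - hypothetical helper, e.g. run with: npx ts-node check-optional-env.ts
import "dotenv/config";

// Variable names taken from the .env template above; blank or missing values
// simply leave the corresponding integration disabled.
const optionalVars = [
  "SUPABASE_URL",
  "LLAMAPARSE_API_KEY",
  "SERPER_API_KEY",
  "SLACK_WEBHOOK_URL",
  "POSTHOG_API_KEY",
  "POSTHOG_HOST",
];

for (const name of optionalVars) {
  console.log(`${name}: ${process.env[name] ? "configured" : "not set (optional)"}`);
}
```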
diff --git a/apps/api/.env.example b/apps/api/.env.example
index e33c5f4e..b025326e 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -24,3 +24,6 @@ PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
 LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
 SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
 SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
+POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
+POSTHOG_HOST= # set if you'd like to send posthog events like job logs
+
diff --git a/apps/api/package.json b/apps/api/package.json
index 047feafe..a79e3dc7 100644
--- a/apps/api/package.json
+++ b/apps/api/package.json
@@ -82,6 +82,7 @@
     "openai": "^4.28.4",
     "pdf-parse": "^1.1.1",
     "pos": "^0.4.2",
+    "posthog-node": "^4.0.1",
     "promptable": "^0.0.9",
     "puppeteer": "^22.6.3",
     "rate-limiter-flexible": "^2.4.2",
diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml
index bd5e37b1..78733752 100644
--- a/apps/api/pnpm-lock.yaml
+++ b/apps/api/pnpm-lock.yaml
@@ -128,6 +128,9 @@ dependencies:
   pos:
     specifier: ^0.4.2
     version: 0.4.2
+  posthog-node:
+    specifier: ^4.0.1
+    version: 4.0.1
   promptable:
     specifier: ^0.0.9
     version: 0.0.9
@@ -5068,6 +5071,16 @@ packages:
       source-map-js: 1.0.2
     dev: false
 
+  /posthog-node@4.0.1:
+    resolution: {integrity: sha512-rtqm2h22QxLGBrW2bLYzbRhliIrqgZ0k+gF0LkQ1SNdeD06YE5eilV0MxZppFSxC8TfH0+B0cWCuebEnreIDgQ==}
+    engines: {node: '>=15.0.0'}
+    dependencies:
+      axios: 1.6.7
+      rusha: 0.8.14
+    transitivePeerDependencies:
+      - debug
+    dev: false
+
   /prelude-ls@1.1.2:
     resolution: {integrity: sha512-ESF23V4SKG6lVSGZgYNpbsiaAkdab6ZgOxe52p7+Kid3W3u3bxR4Vfd/o21dmN7jSt0IwgZ4v5MUd26FEtXE9w==}
     engines: {node: '>= 0.8.0'}
@@ -5330,6 +5343,10 @@ packages:
     engines: {node: '>=10.0.0'}
     dev: false
 
+  /rusha@0.8.14:
+    resolution: {integrity: sha512-cLgakCUf6PedEu15t8kbsjnwIFFR2D4RfL+W3iWFJ4iac7z4B0ZI8fxy4R3J956kAI68HclCFGL8MPoUVC3qVA==}
+    dev: false
+
   /safe-buffer@5.2.1:
     resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
index 356fe763..c443e71d 100644
--- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts
@@ -25,6 +25,8 @@ describe("E2E Tests for API Routes with No Authentication", () => {
     process.env.PLAYWRIGHT_MICROSERVICE_URL = "";
     process.env.LLAMAPARSE_API_KEY = "";
     process.env.TEST_API_KEY = "";
+    process.env.POSTHOG_API_KEY = "";
+    process.env.POSTHOG_HOST = "";
   });
 
   // restore original process.env
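The hunks below wire the new `posthog-node` dependency (added to package.json and the lockfile above) into job logging. The slice of the library's API that this PR relies on is small; roughly the following, sketched from the library's documented usage rather than taken from this repo, with the key, host, distinct id and event name as placeholder values:

```typescript
import { PostHog } from "posthog-node";

async function example() {
  // The client batches events by default; flushAt: 1 and flushInterval: 0 (the
  // options the new posthog.ts service below uses) make it send each event
  // immediately, which suits short-lived worker processes.
  const client = new PostHog("phc_placeholder_key", {
    host: "https://app.posthog.com",
    flushAt: 1,
    flushInterval: 0,
  });

  // capture() records one event against a distinct id.
  client.capture({
    distinctId: "team-1234",
    event: "job-logged",
    properties: { success: true, mode: "crawl" },
  });

  // Flush anything still queued before the process exits.
  await client.shutdown();
}

example().catch(console.error);
```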
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 92a1dc1b..83e0bf37 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -1,6 +1,7 @@
 import { ExtractorOptions } from './../../lib/entities';
 import { supabase_service } from "../supabase";
 import { FirecrawlJob } from "../../types";
+import { posthog } from "../posthog";
 import "dotenv/config";
 
 export async function logJob(job: FirecrawlJob) {
@@ -9,7 +10,6 @@ export async function logJob(job: FirecrawlJob) {
   if (process.env.ENV !== "production") {
     return;
   }
-
 
   const { data, error } = await supabase_service
     .from("firecrawl_jobs")
@@ -30,6 +30,27 @@ export async function logJob(job: FirecrawlJob) {
         num_tokens: job.num_tokens
       },
     ]);
+
+  if (process.env.POSTHOG_API_KEY) {
+    posthog.capture({
+      distinctId: job.team_id === "preview" ? null : job.team_id,
+      event: "job-logged",
+      properties: {
+        success: job.success,
+        message: job.message,
+        num_docs: job.num_docs,
+        time_taken: job.time_taken,
+        team_id: job.team_id === "preview" ? null : job.team_id,
+        mode: job.mode,
+        url: job.url,
+        crawler_options: job.crawlerOptions,
+        page_options: job.pageOptions,
+        origin: job.origin,
+        extractor_options: job.extractor_options,
+        num_tokens: job.num_tokens
+      },
+    });
+  }
   if (error) {
     console.error("Error logging job:\n", error);
   }
diff --git a/apps/api/src/services/posthog.ts b/apps/api/src/services/posthog.ts
new file mode 100644
index 00000000..5ec16e2e
--- /dev/null
+++ b/apps/api/src/services/posthog.ts
@@ -0,0 +1,26 @@
+import { PostHog } from 'posthog-node';
+import "dotenv/config";
+
+export default function PostHogClient() {
+  const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, {
+    host: process.env.POSTHOG_HOST,
+    flushAt: 1,
+    flushInterval: 0
+  });
+  return posthogClient;
+}
+
+class MockPostHog {
+  capture() {}
+}
+
+// Using the actual PostHog class if POSTHOG_API_KEY exists, otherwise using the mock class
+// Additionally, print a warning to the terminal if POSTHOG_API_KEY is not provided
+export const posthog = process.env.POSTHOG_API_KEY
+  ? PostHogClient()
+  : (() => {
+      console.warn(
+        "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more."
+      );
+      return new MockPostHog();
+    })();
\ No newline at end of file
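With that service in place, analytics is fire-and-forget for callers: they import the shared `posthog` export and call `capture()`, and the mock keeps the call safe when no key is configured. Below is a minimal sketch of a call site in the style of `log_job.ts` above; the function name, event properties and import path are illustrative, and the nulling of the distinct id that `log_job.ts` does for "preview" jobs is omitted here:

```typescript
import "dotenv/config";
import { posthog } from "../services/posthog"; // adjust the relative path to the caller's location

export function recordJobLogged(teamId: string, success: boolean) {
  // Extra env guard mirroring log_job.ts; strictly speaking the MockPostHog
  // fallback already turns capture() into a no-op when POSTHOG_API_KEY is unset.
  if (process.env.POSTHOG_API_KEY) {
    posthog.capture({
      distinctId: teamId,
      event: "job-logged", // same event name log_job.ts emits
      properties: { success, team_id: teamId },
    });
  }
  // Note: MockPostHog only implements capture(), so other client methods
  // (flush, shutdown, ...) are not reachable through this shared export.
}
```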