diff --git a/.github/workflows/.keep b/.github/workflows/.keep new file mode 100644 index 0000000..e69de29 diff --git a/.gitignore b/.gitignore index c6bba59..9dadba9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,130 +1,3 @@ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -lerna-debug.log* -.pnpm-debug.log* - -# Diagnostic reports (https://nodejs.org/api/report.html) -report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json - -# Runtime data -pids -*.pid -*.seed -*.pid.lock - -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov - -# Coverage directory used by tools like istanbul -coverage -*.lcov - -# nyc test coverage -.nyc_output - -# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) -.grunt - -# Bower dependency directory (https://bower.io/) -bower_components - -# node-waf configuration -.lock-wscript - -# Compiled binary addons (https://nodejs.org/api/addons.html) -build/Release - -# Dependency directories +package-lock.json node_modules/ -jspm_packages/ - -# Snowpack dependency directory (https://snowpack.dev/) -web_modules/ - -# TypeScript cache -*.tsbuildinfo - -# Optional npm cache directory -.npm - -# Optional eslint cache -.eslintcache - -# Optional stylelint cache -.stylelintcache - -# Microbundle cache -.rpt2_cache/ -.rts2_cache_cjs/ -.rts2_cache_es/ -.rts2_cache_umd/ - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variable files -.env -.env.development.local -.env.test.local -.env.production.local -.env.local - -# parcel-bundler cache (https://parceljs.org/) -.cache -.parcel-cache - -# Next.js build output -.next -out - -# Nuxt.js build / generate output -.nuxt -dist - -# Gatsby files -.cache/ -# Comment in the public line in if your project uses Gatsby and not Next.js -# https://nextjs.org/blog/next-9-1#public-directory-support -# public - -# vuepress build output -.vuepress/dist - -# 
vuepress v2.x temp and cache directory -.temp -.cache - -# Docusaurus cache and generated files -.docusaurus - -# Serverless directories -.serverless/ - -# FuseBox cache -.fusebox/ - -# DynamoDB Local files -.dynamodb/ - -# TernJS port file -.tern-port - -# Stores VSCode versions used for testing VSCode extensions -.vscode-test - -# yarn v2 -.yarn/cache -.yarn/unplugged -.yarn/build-state.yml -.yarn/install-state.gz -.pnp.* +.DS_Store \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..8fddc71 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "thinapps-shared"] + path = thinapps-shared + url = git@github.com:jina-ai/thinapps-shared.git diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..37bdb37 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,10 @@ +{ + "recommendations": [ + "editorconfig.editorconfig", + "octref.vetur", + "redhat.vscode-yaml", + "dbaeumer.vscode-eslint", + "esbenp.prettier-vscode", + "streetsidesoftware.code-spell-checker" + ] +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..c7cab1b --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,60 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Fullstack: attach", + "request": "attach", + "cwd": "${workspaceFolder}/backend/functions", + "skipFiles": [ + "/**" + ], + "type": "node", + "preLaunchTask": "Fullstack:debug" + }, + { + "name": "Debug Fullstack: attach: with proxy", + "request": "attach", + "cwd": "${workspaceFolder}/backend/functions", + "skipFiles": [ + "/**" + ], + "type": "node", + "preLaunchTask": "Fullstack:debug:with-proxy" + }, + { + "name": "Attach", + "port": 9229, + "request": "attach", + "skipFiles": [ + "/**" + ], + "type": "node" + }, + { + "name": "Attach by Process ID", + "processId": "${command:PickProcess}", + "request": "attach", + "skipFiles": [ + "/**" + ], + "type": "node" +
}, + { + "name": "Debug Fullstack", + "request": "launch", + "runtimeArgs": [ + "emulators:start", + "--import=../.firebase-emu", + "--export-on-exit=../.firebase-emu", + ], + "cwd": "${workspaceFolder}/backend/functions", + "runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase", + "skipFiles": [ + "/**" + ], + "type": "node", + "preLaunchTask": "Fullstack:prepare", + "killBehavior": "polite" + }, + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1916e11 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,105 @@ +{ + "editor.wordWrap": "on", + "editor.wordWrapColumn": 120, + "files.trimTrailingWhitespace": true, + "files.trimFinalNewlines": true, + "[javascript]": { + "editor.defaultFormatter": "vscode.typescript-language-features" + }, + "[vue]": { + "editor.defaultFormatter": "Vue.volar" + }, + "[jsonc]": { + "editor.defaultFormatter": "vscode.json-language-features" + }, + "[typescript]": { + "editor.defaultFormatter": "vscode.typescript-language-features" + }, + "[json]": { + "editor.defaultFormatter": "vscode.json-language-features" + }, + "[yaml]": { + "editor.defaultFormatter": "redhat.vscode-yaml" + }, + "[markdown]": { + "files.trimTrailingWhitespace": false + }, + "typescript.tsdk": "node_modules/typescript/lib", + "vetur.format.defaultFormatter.ts": "vscode-typescript", + "vetur.format.defaultFormatter.js": "vscode-typescript", + "typescript.preferences.quoteStyle": "single", + "typescript.format.semicolons": "insert", + "typescript.preferences.importModuleSpecifier": "project-relative", + "typescript.locale": "en", + "cSpell.enabled": true, + "cSpell.words": [ + "Apiextensions", + "apihubble", + "auths", + "AUTOCASTABLE", + "Autocasting", + "backchannel", + "bodyparser", + "bson", + "BUILDKIT", + "buildx", + "castable", + "cmdl", + "Commandline", + "conpty", + "cpid", + "deferreds", + "DEVBOT", + "dockerhub", + "entrypoint", + "ENVIROMENT", + "finetuner", + 
"fpath", + "fswalk", + "Grafana", + "Hasher", + "istio", + "jina", + "jinahub", + "jinameta", + "Knative", + "kourier", + "kube", + "kubectl", + "Kubernetes", + "kwargs", + "letsencrypt", + "liveconfigs", + "LOGNAME", + "metas", + "Mgmt", + "middlewares", + "minikube", + "minio", + "ndjson", + "nodelib", + "oidc", + "openapi", + "paramtypes", + "penv", + "pino", + "prebuild", + "quickstart", + "reinit", + "sslip", + "subval", + "Succ", + "timedout", + "TOTP", + "tsbuildinfo", + "tsyringe", + "typeclass", + "upsert", + "upserted", + "userinfo", + "Vecs", + "vectorize", + "WECHAT", + "WXPAY" + ], +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..fc4489b --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,156 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "npm", + "script": "build", + "group": "build", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "problemMatcher": [], + "label": "Backend:rebuild", + "detail": "Backend:rebuild" + }, + { + "type": "npm", + "script": "emu:reset", + "group": "build", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "problemMatcher": [], + "label": "Backend:reset-emulator", + "detail": "Backend:reset-emulator" + }, + { + "type": "typescript", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "tsconfig": "backend/functions/tsconfig.json", + "option": "watch", + "isBackground": true, + "problemMatcher": [ + "$tsc-watch" + ], + "group": "build", + "label": "Backend:build:watch" + }, + { + "type": "npm", + "script": "emu:debug", + "group": "none", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "problemMatcher": [ + { + "base": "$tsc", + "background": { + "activeOnStart": false, + "beginsPattern": "shutdown requested|Starting emulators", + "endsPattern": "Debugger listening" + } + } + ], + "label": "Backend:start-emulator-debug", + "detail": "Backend:start-emulator-debug", + 
"dependsOn": [ + "Backend:build:watch" + ], + "isBackground": true, + }, + { + "type": "npm", + "script": "dev", + "options": { + "cwd": "${workspaceFolder}/webapp", + }, + "group": "build", + "label": "Frontend:start:dev", + "detail": "Frontend:start:dev", + "isBackground": true, + "problemMatcher": { + "base": "$vite", + "background": { + "activeOnStart": true, + "endsPattern": "OK", + "beginsPattern": "vite" + } + }, + }, + { + "type": "npm", + "script": "dev", + "options": { + "cwd": "${workspaceFolder}/webapp", + "env": { + "FIREBASE_EMULATE": "true", + } + }, + "group": "build", + "label": "Frontend:start:emu", + "detail": "Frontend:start:emu", + "isBackground": true, + "problemMatcher": { + "base": "$vite", + "background": { + "activeOnStart": true, + "endsPattern": "OK", + "beginsPattern": "vite" + } + }, + }, + { + "type": "npm", + "script": "emu:debug2", + "group": "none", + "options": { + "cwd": "${workspaceFolder}/backend/functions", + "env": { + "https_proxy": "http://127.0.0.1:7890", + "http_proxy": "http://127.0.0.1:7890", + "all_proxy": "socks5://127.0.0.1:7890" + } + }, + "problemMatcher": [ + { + "base": "$tsc", + "background": { + "activeOnStart": false, + "beginsPattern": "shutdown requested|Starting emulators", + "endsPattern": "Debugger listening" + } + } + ], + "label": "Backend:start-emulator-debug:with-proxy", + "detail": "Backend:start-emulator-debug:with-proxy", + "dependsOn": [ + "Backend:build:watch" + ], + "isBackground": true, + }, + { + "label": "Fullstack:prepare", + "dependsOn": [ + "Frontend:start:emu", + "Backend:build:watch", + ], + }, + { + "label": "Fullstack:debug", + "dependsOn": [ + // "Frontend:start:emu", + "Backend:start-emulator-debug", + ], + }, + { + "label": "Fullstack:debug:with-proxy", + "dependsOn": [ + "Frontend:start:emu", + "Backend:start-emulator-debug:with-proxy", + ], + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index d2f0ba7..904625e 100644 --- a/README.md +++ b/README.md @@ -1 
+1,112 @@ -# url2text \ No newline at end of file +# Url2Text + +## Development Guide + +### Prerequisites +- Node v18 (The build fails for Node version >18) +- Yarn +- Firebase CLI (`npm install -g firebase-tools`) + +### Installation + +Clone the url2text repo and fetch its submodule by running the commands: + +```bash +git clone git@github.com:jina-ai/url2text.git +cd url2text +git submodule update --init +``` + +After a successful clone, install the packages for the backend and the webapp. + +For the backend, go to the `backend/functions` directory and install the npm dependencies. + +```bash +cd backend/functions +npm install +``` + +For the frontend (webapp), go to the `webapp` directory and install the yarn dependencies. + +```bash +cd webapp +yarn +``` + +### Configure + +**Establish localhost connection:** + +Once the packages are installed, open the `App.vue` file inside `webapp/src/` and uncomment the following line: + +```js +connectFunctionsEmulator(functions, 'localhost', 5001); +``` + +### Run The Application Now + +To run the backend server, inside the `backend/functions` directory, run the following command: + +```bash +npm run serve +``` + +To run the frontend app, inside the `webapp` directory, run the following command: + +```bash +yarn dev +``` + +### Known Errors + +1. If you encounter the 'npm ERR! /bin/sh: pkg-config: command not found' error on macOS, run the command `brew install pkg-config cairo libpng jpeg giflib pango librsvg` + +## Best practices + +### Directory structure + +There are three folders: +1. `webapp` is the frontend project of `SceneX`, knowledge requirements: +- Vue 3 +- Quasar +- ... + +2. `backend` contains source code of backend logic, knowledge requirements: +- Node.js +- Firebase +- ... + +3. `scripts` folder includes custom scripts we might need during development or for production; currently we have the following scripts: +- `translate` is responsible for translating and updating our i18n language files in the frontend project. + +### Best practices of frontend +1. 
**Quasar docs** is your `best friend`. Since the frontend project highly depends on framework `Quasar`. It is recommended to use the predefined classes and components and avoid defining your custom classes +2. **Double check** of the UI output in `Dark mode` and `Light mode`. Again, use predefined classes and props. +3. **Plugins in boot** folder: create corresponding file in `boot` folder and use them in `quasar.config.js`: +```js +module.exports = configure(function() { + return { + ... + boot: [ + 'i18n', + 'axios', + 'firebase', + 'addressbar-color', + 'quasar-lang-pack' + ], + ... + } +}) + +``` + +### Best practices of backend +1. **Remember to deploy your functions** by running: +```bash +# deploy all functions +firebase deploy --only functions + +# deploy a specific function +firebase deploy --only functions:{function name} + +``` diff --git a/backend/.firebaserc b/backend/.firebaserc new file mode 100644 index 0000000..f585142 --- /dev/null +++ b/backend/.firebaserc @@ -0,0 +1,5 @@ +{ + "projects": { + "default": "reader-6b7dc" + } +} diff --git a/backend/.gitignore b/backend/.gitignore new file mode 100644 index 0000000..3f217cc --- /dev/null +++ b/backend/.gitignore @@ -0,0 +1,75 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +firebase-debug.log* +firebase-debug.*.log* + +# Firebase cache +.firebase/ + +# Firebase config + +# Uncomment this if you'd like others to create their own Firebase project. +# For a team working on the same Firebase project(s), it is recommended to leave +# it commented so all members can deploy to the same project(s) in .firebaserc. 
+# .firebaserc + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (http://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.secret.local + +toy*.ts + +.DS_Store +build/ +.firebase-emu/ +*.log +.DS_Store diff --git a/backend/firebase.json b/backend/firebase.json new file mode 100644 index 0000000..42ff309 --- /dev/null +++ b/backend/firebase.json @@ -0,0 +1,44 @@ +{ + "firestore": { + "rules": "firestore.rules", + "indexes": "firestore.indexes.json" + }, + "functions": [ + { + "source": "functions", + "codebase": "default", + "ignore": [ + "node_modules", + "src", + ".git", + "firebase-debug.log", + "firebase-debug.*.log" + ], + "predeploy": [ + "npm --prefix \"$RESOURCE_DIR\" run build:clean", + "npm --prefix \"$RESOURCE_DIR\" run build" + ] + } + ], + "storage": { + "rules": "storage.rules" + }, + "emulators": { + "ui": { + "enabled": true + }, + "singleProjectMode": true, + "functions": { + "port": 5001 + }, + "auth": { + "port": 9099 + }, + "firestore": { + "port": 9098 + }, + "storage": { + "port": 9097 + } + } +} \ No newline at end of file diff --git a/backend/firestore.indexes.json b/backend/firestore.indexes.json new file mode 100644 index 0000000..a4752ee --- /dev/null +++ b/backend/firestore.indexes.json @@ -0,0 +1,19 @@ +{ + "indexes": [ + { + "collectionGroup": 
"prompts", + "queryScope": "COLLECTION_GROUP", + "fields": [ + { + "fieldPath": "id", + "order": "ASCENDING" + }, + { + "fieldPath": "isPublic", + "order": "ASCENDING" + } + ] + } + ], + "fieldOverrides": [] +} \ No newline at end of file diff --git a/backend/firestore.rules b/backend/firestore.rules new file mode 100644 index 0000000..09d4ede --- /dev/null +++ b/backend/firestore.rules @@ -0,0 +1,32 @@ +rules_version = '2'; +service cloud.firestore { + match /databases/{database}/documents { + // match /questions/{document=**} { + // allow read: if request.auth != null + // } + + // match /answers/{userId}/profiles/default { + // allow read, write: if request.auth != null && request.auth.uid == userId + // } + + match /credits/{userId}/{document=**} { + allow read: if request.auth != null && request.auth.uid == userId + } + + match /users/{userId}/prompts/{document=**} { + allow read: if request.auth != null && request.auth.uid == userId + } + + // match /users/{userId}/profiles/{document=**} { + // allow read: if request.auth != null && request.auth.uid == userId + // } + + match /users/{userId}/creditHistory/{document=**} { + allow read: if request.auth != null && request.auth.uid == userId + } + + match /{document=**} { + allow read, write: if false; + } + } +} diff --git a/backend/functions/.editorconfig b/backend/functions/.editorconfig new file mode 100644 index 0000000..17d2fbb --- /dev/null +++ b/backend/functions/.editorconfig @@ -0,0 +1,36 @@ +root = true + +[*] +end_of_line = lf +charset = utf-8 +indent_style = space +insert_final_newline = true +trim_trailing_whitespace = true +indent_size = 4 +quote_type = single +max_line_length = 120 + +[*.py] +indent_size = 4 + +[*.ts] +indent_size = 4 + +[*.js] +indent_size = 2 + +[*.vue] +indent_size = 2 + +[*.*sx] +indent_size = 2 + +[*.*ml] +indent_size = 2 + +[*.json] +indent_size = 2 + +[*.md] +indent_size = 2 +trim_trailing_whitespace = false diff --git a/backend/functions/.env.example 
b/backend/functions/.env.example new file mode 100644 index 0000000..e69de29 diff --git a/backend/functions/.vscode/launch.json b/backend/functions/.vscode/launch.json new file mode 100644 index 0000000..5ed95d8 --- /dev/null +++ b/backend/functions/.vscode/launch.json @@ -0,0 +1,27 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Attach by Process ID", + "processId": "${command:PickProcess}", + "request": "attach", + "skipFiles": [ + "/**" + ], + "type": "node" + }, + { + "name": "Attach", + "port": 9229, + "request": "attach", + "skipFiles": [ + "/**" + ], + "type": "node" + } + ] + } + \ No newline at end of file diff --git a/backend/functions/.vscode/settings.json b/backend/functions/.vscode/settings.json new file mode 100644 index 0000000..3ceefd4 --- /dev/null +++ b/backend/functions/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "cSpell.words": [ + "AIHTTP", + "Castable", + "civkit", + "Firestore", + "openai" + ], + "typescript.tsdk": "node_modules/typescript/lib" +} diff --git a/backend/functions/firebase-export-1712748362961bSfwZx/firestore_export/firestore_export.overall_export_metadata b/backend/functions/firebase-export-1712748362961bSfwZx/firestore_export/firestore_export.overall_export_metadata new file mode 100644 index 0000000..92b92aa Binary files /dev/null and b/backend/functions/firebase-export-1712748362961bSfwZx/firestore_export/firestore_export.overall_export_metadata differ diff --git a/backend/functions/package.json b/backend/functions/package.json new file mode 100644 index 0000000..d989403 --- /dev/null +++ b/backend/functions/package.json @@ -0,0 +1,72 @@ +{ + "name": "url2text", + "scripts": { + "lint": "eslint --ext .js,.ts .", + "build": "tsc -p .", + "build:watch": "tsc --watch", + "build:clean": "rm -rf ./build", + "shell": "npm run build && firebase functions:shell", + "emu:stage": "cd .. 
&& tar -czvf firebase-emu-preset.tgz .firebase-emu", + "emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../", + "emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit", + "emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions", + "emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions", + "emu:kill": "killall java", + "serve": "npm run build && npm run emu:start", + "debug": "npm run build && npm run emu:start -- --inspect-functions", + "from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit", + "from-preset": "npm run build && npm run emu:reset && npm run emu:start", + "start": "npm run shell", + "deploy": "firebase deploy --only functions", + "logs": "firebase functions:log" + }, + "engines": { + "node": "20" + }, + "main": "build/index.js", + "dependencies": { + "@google-cloud/translate": "^8.2.0", + "@mozilla/readability": "^0.5.0", + "@napi-rs/canvas": "^0.1.44", + "@types/turndown": "^5.0.4", + "archiver": "^6.0.1", + "axios": "^1.3.3", + "bcrypt": "^5.1.0", + "civkit": "^0.6.5-be430ac", + "cors": "^2.8.5", + "dayjs": "^1.11.9", + "express": "^4.19.2", + "firebase-admin": "^11.5.0", + "firebase-functions": "^4.8.0", + "generic-pool": "^3.9.0", + "htmlparser2": "^9.0.0", + "jose": "^5.1.0", + "langdetect": "^0.2.1", + "minio": "^7.1.3", + "openai": "^4.20.0", + "puppeteer": "^22.6.3", + "stripe": "^11.11.0", + "tiktoken": "^1.0.10", + "turndown": "^7.1.3", + "undici": "^5.24.0" + }, + "devDependencies": { + "@types/archiver": "^5.3.4", + "@types/bcrypt": "^5.0.0", + "@types/cors": "^2.8.17", + "@types/generic-pool": "^3.8.1", + "@types/node": "^18", + "@typescript-eslint/eslint-plugin": "^5.12.0", + "@typescript-eslint/parser": "^5.12.0", + "eslint": "^8.9.0", + "eslint-config-google": "^0.14.0", + "eslint-plugin-import": "^2.25.4", + 
"firebase-functions-test": "^3.0.0",
+    "replicate": "^0.16.1",
+    "typescript": "^5.1.6"
+  },
+  "private": true,
+  "exports": {
+    ".": "./build/index.js"
+  }
+}
diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
new file mode 100644
index 0000000..4d366ed
--- /dev/null
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -0,0 +1,82 @@
+import { marshalErrorLike, RPCHost, RPCReflection } from 'civkit';
+import { singleton } from 'tsyringe';
+import { CloudHTTPv2, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
+import { PuppeteerControl } from '../services/puppeteer';
+import TurnDownService from 'turndown';
+
+
+/**
+ * RPC host exposing the crawler endpoint: scrapes a URL through PuppeteerControl,
+ * converts each snapshot with Turndown and streams the result as server-sent events.
+ */
+@singleton()
+export class CrawlerHost extends RPCHost {
+    logger = this.globalLogger.child({ service: this.constructor.name });
+
+    turnDownService = new TurnDownService();
+
+    constructor(
+        protected globalLogger: Logger,
+        protected puppeteerControl: PuppeteerControl,
+    ) {
+        super(...arguments);
+    }
+
+    override async init() {
+        await this.dependencyReady();
+
+        this.emit('ready');
+    }
+
+    /**
+     * Crawls `url` and writes one `data` event per snapshot that has content.
+     * A failure is logged and sent as a single `error` event; the stream is ended either way.
+     *
+     * @param rpcReflect receives the stream via `return()` before crawling starts.
+     * @param url page to crawl.
+     * @returns the same server-sent event stream that was handed to `rpcReflect`.
+     */
+    @CloudHTTPv2({
+        exportInGroup: ['crawler'],
+        httpMethod: ['get', 'post'],
+        returnType: OutputServerEventStream,
+    })
+    async crawl(
+        @RPCReflect() rpcReflect: RPCReflection,
+        @Param('url', { required: true }) url: string
+    ) {
+        await this.serviceReady();
+        const sseStream = new OutputServerEventStream();
+
+        rpcReflect.return(sseStream);
+
+        try {
+            for await (const scrapped of this.puppeteerControl.scrap(url)) {
+                // A snapshot is either a string or an object exposing `content`.
+                const content = typeof scrapped.snapshot === 'string' ? scrapped.snapshot : (scrapped.snapshot as any)?.content;
+                // Log the URL and a flag: interpolating the snapshot object would print "[object Object]".
+                this.logger.info(`Scrapped: ${url}`, { hasContent: Boolean(content) });
+                if (!content) {
+                    continue;
+                }
+                sseStream.write({
+                    event: 'data',
+                    data: this.turnDownService.turndown(content),
+                });
+            }
+        } catch (err: any) {
+            this.logger.error(`Failed to crawl ${url}`, { err: marshalErrorLike(err) });
+            sseStream.write({
+                event: 'error',
+                // Send the marshalled form of the error, the same shape that is logged above.
+                data: marshalErrorLike(err),
+            });
+        }
+
+        sseStream.end();
+
+        return sseStream;
+    }
+
+
+}
diff --git a/backend/functions/src/fetch.d.ts b/backend/functions/src/fetch.d.ts
new file mode 100644
index 0000000..2bc68c0
--- /dev/null
+++ b/backend/functions/src/fetch.d.ts
@@ -0,0 +1,13 @@
+declare global {
+    export const {
+        fetch,
+        FormData,
+        Headers,
+        Request,
+        Response,
+        File,
+    }: typeof import('undici');
+    export type { FormData, Headers, Request, RequestInit, Response, File } from 'undici';
+}
+
+export { };
diff --git a/backend/functions/src/index.ts b/backend/functions/src/index.ts
new file mode 100644
index 0000000..fdabab0
--- /dev/null
+++ b/backend/functions/src/index.ts
@@ -0,0 +1,33 @@
+import 'reflect-metadata';
+import * as functions from 'firebase-functions';
+import { initializeApp } from 'firebase-admin/app';
+initializeApp();
+
+import secretExposer from './shared/services/secrets';
+
+export const onUserCreated = functions
+    .runWith({ secrets: [...secretExposer.bundle], memory: '512MB' })
+    .auth.user()
+    .onCreate(async (user) => {
+
+        return null;
+    });
+
+export const onUserLogin = functions
+    .runWith({ secrets: [...secretExposer.bundle], memory: '512MB' })
+    .auth.user()
+    .beforeSignIn(async (user, _ctx) => {
+
+        return;
+    });
+
+import { loadModulesDynamically, registry } from './shared';
+import path from 'path';
+loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
+
+Object.assign(exports, registry.exportGrouped({
+    memory: '1GiB',
+    timeoutSeconds: 540,
+}));
+registry.title = 'url2text';
+registry.version = '0.1.0';
diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts
new file mode 100644
index 0000000..33514ee
--- /dev/null
+++ b/backend/functions/src/services/puppeteer.ts
@@ -0,0 +1,182 @@
+import { AsyncService, Defer, marshalErrorLike } from 'civkit';
+import { container, singleton } from 'tsyringe';
+import puppeteer, { Browser } from 'puppeteer';
+import { Logger } from '../shared/services/logger';
+import genericPool from 'generic-pool';
+import os from 'os';
+import fs from 'fs';
+
+
+// Readability source text, injected into every new document by newPage().
+const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
+
+const BYTES_PER_GIB = 1024 * 1024 * 1024;
+
+/**
+ * Launches one Puppeteer browser and serves pooled pages that report
+ * Readability snapshots of whatever document they load.
+ */
+@singleton()
+export class PuppeteerControl extends AsyncService {
+
+    browser!: Browser;
+    logger = this.globalLogger.child({ service: this.constructor.name });
+
+    pagePool = genericPool.createPool({
+        create: async () => {
+            const page = await this.newPage();
+            return page;
+        },
+        destroy: async (page) => {
+            // newPage() gives every page its own browser context, so dispose of that context.
+            await page.browserContext().close();
+        },
+        validate: async (page) => {
+            return this.browser.connected && !page.isClosed();
+        }
+    }, {
+        // One page per GiB of free memory, but never fewer than one. The divisor needs its
+        // parentheses: `freemem / 1024 * 1024 * 1024` multiplies the byte count instead.
+        max: Math.max(1, Math.ceil(os.freemem() / BYTES_PER_GIB)),
+        min: 0,
+    });
+
+    constructor(protected globalLogger: Logger) {
+        super(...arguments);
+    }
+
+    /** Closes any previous browser, launches a new one and emits `ready`; a later disconnect emits `crippled`. */
+    override async init() {
+        await this.dependencyReady();
+
+        if (this.browser) {
+            await this.browser.close();
+        }
+        this.browser = await puppeteer.launch({
+            // NOTE(review): a non-headless browser presumably needs a display; confirm for deployed functions.
+            headless: false,
+            args: ['--no-sandbox', '--disable-setuid-sandbox'],
+        });
+        this.browser.once('disconnected', () => {
+            this.logger.warn(`Browser disconnected`);
+            this.emit('crippled');
+        });
+
+        this.emit('ready');
+    }
+
+    /**
+     * Creates a page in a dedicated browser context and installs the in-page snapshot reporter.
+     *
+     * @returns a page that emits a `snapshot` event whenever the in-page code reports one.
+     */
+    async newPage() {
+        await this.serviceReady();
+        const dedicatedContext = await this.browser.createBrowserContext();
+
+        const page = await dedicatedContext.newPage();
+        await page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`);
+        await page.setViewport({ width: 1920, height: 1080 });
+        // In-page code calls window.reportSnapshot(...); re-emit it as a `snapshot` event on the page.
+        await page.exposeFunction('reportSnapshot', (snapshot: any) => {
+            page.emit('snapshot', snapshot);
+        });
+
+        await page.evaluateOnNewDocument(READABILITY_JS);
+
+        await page.evaluateOnNewDocument(() => {
+            // Assign the function itself: a call expression such as `giveSnapshot()` is not assignable.
+            // @ts-expect-error
+            window.giveSnapshot = () => {
+                // @ts-expect-error
+                return new Readability(document.cloneNode(true)).parse();
+            };
+            let aftershot: any;
+            const handlePageLoad = () => {
+                // @ts-expect-error
+                if (document.readyState !== 'complete' && document.readyState !== 'interactive') {
+                    return;
+                }
+
+                // @ts-expect-error
+                const parsed = window.giveSnapshot();
+                console.log(parsed);
+                if (parsed) {
+                    // @ts-expect-error
+                    window.reportSnapshot(parsed);
+                } else {
+                    // Nothing parsed yet: report again after 500ms, replacing any pending retry.
+                    if (aftershot) {
+                        clearTimeout(aftershot);
+                    }
+                    aftershot = setTimeout(() => {
+                        // @ts-expect-error
+                        window.reportSnapshot(window.giveSnapshot());
+                    }, 500);
+                }
+            };
+            // setInterval(handlePageLoad, 1000);
+            // @ts-expect-error
+            document.addEventListener('readystatechange', handlePageLoad);
+            // The page `load` event is dispatched on window, so listen there rather than on document.
+            // @ts-expect-error
+            window.addEventListener('load', handlePageLoad);
+        });
+
+        // TODO: further setup the page;
+
+        return page;
+    }
+
+    /**
+     * Loads `url` in a pooled page and yields `{ snapshot, screenshot }` each time the page reports a
+     * new snapshot, then one final pair once navigation has settled.
+     *
+     * A failure is logged and ends the iteration without throwing; the page is destroyed either way.
+     */
+    async *scrap(url: string) {
+        const page = await this.pagePool.acquire();
+        let snapshot: unknown;
+        let nextSnapshotDeferred = Defer();
+        let finalized = false;
+        const hdl = (s: any) => {
+            if (snapshot === s) {
+                return;
+            }
+            snapshot = s;
+            nextSnapshotDeferred.resolve(s);
+            nextSnapshotDeferred = Defer();
+        };
+        page.on('snapshot', hdl);
+        const gotoPromise = page.goto(url, { waitUntil: 'networkidle2', timeout: 30_000 });
+        // finally() returns a second promise that rejects together with goto. Silence that copy so it
+        // is not reported as an unhandled rejection; the loop below still observes `gotoPromise`.
+        gotoPromise.finally(() => { finalized = true; }).catch(() => undefined);
+
+        try {
+            while (true) {
+                await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
+                const screenshot = await page.screenshot();
+                if (finalized) {
+                    await gotoPromise;
+                    snapshot = await page.evaluate('window.giveSnapshot()');
+                    yield { snapshot, screenshot };
+                    break;
+                }
+                yield { snapshot, screenshot };
+            }
+        } catch (err: any) {
+            // Deliberately best-effort, but leave a trace instead of discarding the error.
+            this.logger.warn(`Failed to scrap ${url}`, { err: marshalErrorLike(err) });
+        } finally {
+            page.off('snapshot', hdl);
+            await this.pagePool.destroy(page);
+        }
+
+    }
+
+}
+
+const puppeteerControl = container.resolve(PuppeteerControl);
+
+export default puppeteerControl;
diff --git a/backend/functions/src/shared
b/backend/functions/src/shared new file mode 120000 index 0000000..c8c3836 --- /dev/null +++ b/backend/functions/src/shared @@ -0,0 +1 @@ +../../../thinapps-shared/backend \ No newline at end of file diff --git a/backend/functions/src/types.d.ts b/backend/functions/src/types.d.ts new file mode 100644 index 0000000..e31c720 --- /dev/null +++ b/backend/functions/src/types.d.ts @@ -0,0 +1,9 @@ +declare module 'langdetect' { + interface DetectionResult { + lang: string; + prob: number; + } + + export function detect(text: string): DetectionResult[]; + export function detectOne(text: string): string | null; +} diff --git a/backend/functions/tsconfig.json b/backend/functions/tsconfig.json new file mode 100644 index 0000000..05915ea --- /dev/null +++ b/backend/functions/tsconfig.json @@ -0,0 +1,21 @@ +{ + "compilerOptions": { + "module": "commonjs", + "noImplicitReturns": true, + "noUnusedLocals": true, + "outDir": "build", + "sourceMap": true, + "strict": true, + "allowJs": true, + "target": "es2022", + "lib": ["es2022"], + "skipLibCheck": true, + "useDefineForClassFields": false, + "experimentalDecorators": true, + "emitDecoratorMetadata": true, + "esModuleInterop": true, + "noImplicitOverride": true, + }, + "compileOnSave": true, + "include": ["src"] +} diff --git a/backend/storage.rules b/backend/storage.rules new file mode 100644 index 0000000..9f33d22 --- /dev/null +++ b/backend/storage.rules @@ -0,0 +1,8 @@ +rules_version = '2'; +service firebase.storage { + match /b/{bucket}/o { + match /{allPaths=**} { + allow read, write: if false; + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..e016ad9 --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "url2text", + "version": "1.0.0", + "description": "### Prerequisite - Node v18 (The build fails for Node version >18) - Yarn - Firebase CLI (`npm install -g firebase-tools`)", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + 
"author": "", + "license": "ISC", + "devDependencies": { + "firebase-tools": "^12.4.2", + "typescript": "^5.1.6" + } +} \ No newline at end of file diff --git a/thinapps-shared b/thinapps-shared new file mode 160000 index 0000000..9f0fa1d --- /dev/null +++ b/thinapps-shared @@ -0,0 +1 @@ +Subproject commit 9f0fa1dd7f8cfcea4c8d79252319b151fae6ed19