This commit is contained in:
yanlong.wang 2024-04-10 19:32:07 +08:00
parent 8b9ecf2e60
commit 89d6d49f06
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
29 changed files with 1090 additions and 130 deletions

0
.github/workflows/.keep vendored Normal file
View File

131
.gitignore vendored
View File

@ -1,130 +1,3 @@
# Logs package-lock.json
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/ node_modules/
jspm_packages/ .DS_Store
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
.cache
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "thinapps-shared"]
path = thinapps-shared
url = git@github.com:jina-ai/thinapps-shared.git

10
.vscode/extensions.json vendored Normal file
View File

@ -0,0 +1,10 @@
{
"recommendations": [
"editorconfig.editorconfig",
"octref.vetur",
"redhat.vscode-yaml",
"dbaeumer.vscode-eslint",
"esbenp.prettier-vscode",
"streetsidesoftware.code-spell-checker"
]
}

60
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,60 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Debug Fullstack: attach",
"request": "attach",
"cwd": "${workspaceFolder}/backend/functions",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:debug"
},
{
"name": "Debug Fullstack: attach: with proxy",
"request": "attach",
"cwd": "${workspaceFolder}/backend/functions",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:debug:with-proxy"
},
{
"name": "Attach",
"port": 9229,
"request": "attach",
"skipFiles": [
"<node_internals>/**"
],
"type": "node"
},
{
"name": "Attach by Process ID",
"processId": "${command:PickProcess}",
"request": "attach",
"skipFiles": [
"<node_internals>/**"
],
"type": "node"
},
{
"name": "Debug Fullstack",
"request": "launch",
"runtimeArgs": [
"emulators:start",
"--import=../.firebase-emu",
"--export-on-exit=../.firebase-emu",
],
"cwd": "${workspaceFolder}/backend/functions",
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:prepare",
"killBehavior": "polite"
},
]
}

105
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,105 @@
{
"editor.wordWrap": "on",
"editor.wordWrapColumn": 120,
"files.trimTrailingWhitespace": true,
"files.trimFinalNewlines": true,
"[javascript]": {
"editor.defaultFormatter": "vscode.typescript-language-features"
},
"[vue]": {
"editor.defaultFormatter": "Vue.volar"
},
"[jsonc]": {
"editor.defaultFormatter": "vscode.json-language-features"
},
"[typescript]": {
"editor.defaultFormatter": "vscode.typescript-language-features"
},
"[json]": {
"editor.defaultFormatter": "vscode.json-language-features"
},
"[yaml]": {
"editor.defaultFormatter": "redhat.vscode-yaml"
},
"[markdown]": {
"files.trimTrailingWhitespace": false
},
"typescript.tsdk": "node_modules/typescript/lib",
"vetur.format.defaultFormatter.ts": "vscode-typescript",
"vetur.format.defaultFormatter.js": "vscode-typescript",
"typescript.preferences.quoteStyle": "single",
"typescript.format.semicolons": "insert",
"typescript.preferences.importModuleSpecifier": "project-relative",
"typescript.locale": "en",
"cSpell.enabled": true,
"cSpell.words": [
"Apiextensions",
"apihubble",
"auths",
"AUTOCASTABLE",
"Autocasting",
"backchannel",
"bodyparser",
"bson",
"BUILDKIT",
"buildx",
"castable",
"cmdl",
"Commandline",
"conpty",
"cpid",
"deferreds",
"DEVBOT",
"dockerhub",
"entrypoint",
"ENVIROMENT",
"finetuner",
"fpath",
"fswalk",
"Grafana",
"Hasher",
"istio",
"jina",
"jinahub",
"jinameta",
"Knative",
"kourier",
"kube",
"kubectl",
"Kubernetes",
"kwargs",
"letsencrypt",
"liveconfigs",
"LOGNAME",
"metas",
"Mgmt",
"middlewares",
"minikube",
"minio",
"ndjson",
"nodelib",
"oidc",
"openapi",
"paramtypes",
"penv",
"pino",
"prebuild",
"quickstart",
"reinit",
"sslip",
"subval",
"Succ",
"timedout",
"TOTP",
"tsbuildinfo",
"tsyringe",
"typeclass",
"upsert",
"upserted",
"userinfo",
"Vecs",
"vectorize",
"WECHAT",
"WXPAY"
],
}

156
.vscode/tasks.json vendored Normal file
View File

@ -0,0 +1,156 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "npm",
"script": "build",
"group": "build",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [],
"label": "Backend:rebuild",
"detail": "Backend:rebuild"
},
{
"type": "npm",
"script": "emu:reset",
"group": "build",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [],
"label": "Backend:reset-emulator",
"detail": "Backend:reset-emulator"
},
{
"type": "typescript",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"tsconfig": "backend/functions/tsconfig.json",
"option": "watch",
"isBackground": true,
"problemMatcher": [
"$tsc-watch"
],
"group": "build",
"label": "Backend:build:watch"
},
{
"type": "npm",
"script": "emu:debug",
"group": "none",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [
{
"base": "$tsc",
"background": {
"activeOnStart": false,
"beginsPattern": "shutdown requested|Starting emulators",
"endsPattern": "Debugger listening"
}
}
],
"label": "Backend:start-emulator-debug",
"detail": "Backend:start-emulator-debug",
"dependsOn": [
"Backend:build:watch"
],
"isBackground": true,
},
{
"type": "npm",
"script": "dev",
"options": {
"cwd": "${workspaceFolder}/webapp",
},
"group": "build",
"label": "Frontend:start:dev",
"detail": "Frontend:start:dev",
"isBackground": true,
"problemMatcher": {
"base": "$vite",
"background": {
"activeOnStart": true,
"endsPattern": "OK",
"beginsPattern": "vite"
}
},
},
{
"type": "npm",
"script": "dev",
"options": {
"cwd": "${workspaceFolder}/webapp",
"env": {
"FIREBASE_EMULATE": "true",
}
},
"group": "build",
"label": "Frontend:start:emu",
"detail": "Frontend:start:emu",
"isBackground": true,
"problemMatcher": {
"base": "$vite",
"background": {
"activeOnStart": true,
"endsPattern": "OK",
"beginsPattern": "vite"
}
},
},
{
"type": "npm",
"script": "emu:debug2",
"group": "none",
"options": {
"cwd": "${workspaceFolder}/backend/functions",
"env": {
"https_proxy": "http://127.0.0.1:7890",
"http_proxy": "http://127.0.0.1:7890",
"all_proxy": "socks5://127.0.0.1:7890"
}
},
"problemMatcher": [
{
"base": "$tsc",
"background": {
"activeOnStart": false,
"beginsPattern": "shutdown requested|Starting emulators",
"endsPattern": "Debugger listening"
}
}
],
"label": "Backend:start-emulator-debug:with-proxy",
"detail": "Backend:start-emulator-debug:with-proxy",
"dependsOn": [
"Backend:build:watch"
],
"isBackground": true,
},
{
"label": "Fullstack:prepare",
"dependsOn": [
"Frontend:start:emu",
"Backend:build:watch",
],
},
{
"label": "Fullstack:debug",
"dependsOn": [
// "Frontend:start:emu",
"Backend:start-emulator-debug",
],
},
{
"label": "Fullstack:debug:with-proxy",
"dependsOn": [
"Frontend:start:emu",
"Backend:start-emulator-debug:with-proxy",
],
}
]
}

113
README.md
View File

@ -1 +1,112 @@
# url2text # Url2Text
## Development Guide
### Prerequisite
- Node v18 (The build fails for Node version >18)
- Yarn
- Firebase CLI (`npm install -g firebase-tools`)
### Installation
Clone the url2text repo and fetch its `thinapps-shared` submodule by running:
```bash
git clone git@github.com:jina-ai/url2text.git
cd url2text
git submodule init
git submodule update
```
After a successful clone, install the packages for backend and the webapp.
For backend, go to the `backend/functions` directory and install the npm dependencies.
```bash
cd backend/functions
npm install
```
For the frontend (webapp), go to the `webapp` directory and install the yarn dependencies.
```bash
cd webapp
yarn
```
### Configure
**Establish localhost connection:**
Once the packages are installed, go to the `App.vue` file inside the `webapp/src/` and uncomment the below code:
```js
connectFunctionsEmulator(functions, 'localhost', 5001);
```
### Run The Application Now
To run the backend server, inside the `backend/functions` dir run the below command:
```bash
npm run serve
```
To run the frontend app, inside the `webapp` dir run the below command:
```bash
yarn dev
```
### Known Errors
1. If you encounter the error `npm ERR! /bin/sh: pkg-config: command not found` on macOS, run `brew install pkg-config cairo libpng jpeg giflib pango librsvg`.
## Best practices
### Directory structure
There are three folders:
1. `webapp` is the frontend project of `SceneX`, knowledge requirements:
- Vue 3
- Quasar
- ...
2. `backend` contains source code of backend logic, knowledge requirements:
- Nodejs
- Firebase
- ...
3. `scripts` folder includes custom scripts we might need during the development or for production, currently we have the following scripts:
- `translate` is responsible for translating and updating our i18n language files in frontend project.
### Best practices of frontend
1. **Quasar docs** is your `best friend`, since the frontend project depends heavily on the `Quasar` framework. Prefer its predefined classes and components over defining your own custom classes.
2. **Double check** of the UI output in `Dark mode` and `Light mode`. Again, use predefined classes and props.
3. **Plugins in boot** folder: create corresponding file in `boot` folder and use them in `quasar.config.js`:
```js
module.exports = configure(function() {
return {
...
boot: [
'i18n',
'axios',
'firebase',
'addressbar-color',
'quasar-lang-pack'
],
...
}
})
```
### Best practices of backend
1. **Remember to deploy your functions** by running:
```bash
# deploy all functions
firebase deploy --only functions
# deploy a specific function
firebase deploy --only functions:{function name}
```

5
backend/.firebaserc Normal file
View File

@ -0,0 +1,5 @@
{
"projects": {
"default": "reader-6b7dc"
}
}

75
backend/.gitignore vendored Normal file
View File

@ -0,0 +1,75 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
firebase-debug.log*
firebase-debug.*.log*
# Firebase cache
.firebase/
# Firebase config
# Uncomment this if you'd like others to create their own Firebase project.
# For a team working on the same Firebase project(s), it is recommended to leave
# it commented so all members can deploy to the same project(s) in .firebaserc.
# .firebaserc
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.secret.local
toy*.ts
.DS_Store
build/
.firebase-emu/
*.log
.DS_Store

44
backend/firebase.json Normal file
View File

@ -0,0 +1,44 @@
{
"firestore": {
"rules": "firestore.rules",
"indexes": "firestore.indexes.json"
},
"functions": [
{
"source": "functions",
"codebase": "default",
"ignore": [
"node_modules",
"src",
".git",
"firebase-debug.log",
"firebase-debug.*.log"
],
"predeploy": [
"npm --prefix \"$RESOURCE_DIR\" run build:clean",
"npm --prefix \"$RESOURCE_DIR\" run build"
]
}
],
"storage": {
"rules": "storage.rules"
},
"emulators": {
"ui": {
"enabled": true
},
"singleProjectMode": true,
"functions": {
"port": 5001
},
"auth": {
"port": 9099
},
"firestore": {
"port": 9098
},
"storage": {
"port": 9097
}
}
}

View File

@ -0,0 +1,19 @@
{
"indexes": [
{
"collectionGroup": "prompts",
"queryScope": "COLLECTION_GROUP",
"fields": [
{
"fieldPath": "id",
"order": "ASCENDING"
},
{
"fieldPath": "isPublic",
"order": "ASCENDING"
}
]
}
],
"fieldOverrides": []
}

32
backend/firestore.rules Normal file
View File

@ -0,0 +1,32 @@
rules_version = '2';
// Firestore security rules for client SDK access.
// Only read access to a user's own credits, prompts and credit history is granted;
// no `allow write` statement other than the catch-all `if false` appears in this file.
service cloud.firestore {
match /databases/{database}/documents {
// Disabled rule kept for reference: authenticated read access to /questions.
// match /questions/{document=**} {
// allow read: if request.auth != null
// }
// Disabled rule kept for reference: owner read/write on the default answers profile.
// match /answers/{userId}/profiles/default {
// allow read, write: if request.auth != null && request.auth.uid == userId
// }
// Paths under /credits/{userId}: readable only by the signed-in user whose uid equals {userId}.
match /credits/{userId}/{document=**} {
allow read: if request.auth != null && request.auth.uid == userId
}
// Paths under /users/{userId}/prompts: readable only by the owning signed-in user.
match /users/{userId}/prompts/{document=**} {
allow read: if request.auth != null && request.auth.uid == userId
}
// Disabled rule kept for reference: owner read access to profiles.
// match /users/{userId}/profiles/{document=**} {
// allow read: if request.auth != null && request.auth.uid == userId
// }
// Paths under /users/{userId}/creditHistory: readable only by the owning signed-in user.
match /users/{userId}/creditHistory/{document=**} {
allow read: if request.auth != null && request.auth.uid == userId
}
// Catch-all: this rule itself never grants access, so any path not matched by an
// allowing rule above stays inaccessible.
match /{document=**} {
allow read, write: if false;
}
}
}

View File

@ -0,0 +1,36 @@
root = true
[*]
end_of_line = lf
charset = utf-8
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
indent_size = 4
quote_type = single
max_line_length = 120
[*.py]
indent_size = 4
[*.ts]
indent_size = 4
[*.js]
indent_size = 2
[*.vue]
indent_size = 2
[*.*sx]
indent_size = 2
[*.*ml]
indent_size = 2
[*.json]
indent_size = 2
[*.md]
indent_size = 2
trim_trailing_whitespace = false

View File

27
backend/functions/.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,27 @@
{
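// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387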
"version": "0.2.0",
"configurations": [
{
"name": "Attach by Process ID",
"processId": "${command:PickProcess}",
"request": "attach",
"skipFiles": [
"<node_internals>/**"
],
"type": "node"
},
{
"name": "Attach",
"port": 9229,
"request": "attach",
"skipFiles": [
"<node_internals>/**"
],
"type": "node"
}
]
}

10
backend/functions/.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,10 @@
{
"cSpell.words": [
"AIHTTP",
"Castable",
"civkit",
"Firestore",
"openai"
],
"typescript.tsdk": "node_modules/typescript/lib"
}

View File

@ -0,0 +1,72 @@
{
"name": "url2text",
"scripts": {
"lint": "eslint --ext .js,.ts .",
"build": "tsc -p .",
"build:watch": "tsc --watch",
"build:clean": "rm -rf ./build",
"shell": "npm run build && firebase functions:shell",
"emu:stage": "cd .. && tar -czvf firebase-emu-preset.tgz .firebase-emu",
"emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../",
"emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit",
"emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
"emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
"emu:kill": "killall java",
"serve": "npm run build && npm run emu:start",
"debug": "npm run build && npm run emu:start -- --inspect-functions",
"from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit",
"from-preset": "npm run build && npm run emu:reset && npm run emu:start",
"start": "npm run shell",
"deploy": "firebase deploy --only functions",
"logs": "firebase functions:log"
},
"engines": {
"node": "20"
},
"main": "build/index.js",
"dependencies": {
"@google-cloud/translate": "^8.2.0",
"@mozilla/readability": "^0.5.0",
"@napi-rs/canvas": "^0.1.44",
"@types/turndown": "^5.0.4",
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"civkit": "^0.6.5-be430ac",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
"express": "^4.19.2",
"firebase-admin": "^11.5.0",
"firebase-functions": "^4.8.0",
"generic-pool": "^3.9.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"langdetect": "^0.2.1",
"minio": "^7.1.3",
"openai": "^4.20.0",
"puppeteer": "^22.6.3",
"stripe": "^11.11.0",
"tiktoken": "^1.0.10",
"turndown": "^7.1.3",
"undici": "^5.24.0"
},
"devDependencies": {
"@types/archiver": "^5.3.4",
"@types/bcrypt": "^5.0.0",
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/node": "^18",
"@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0",
"eslint": "^8.9.0",
"eslint-config-google": "^0.14.0",
"eslint-plugin-import": "^2.25.4",
"firebase-functions-test": "^3.0.0",
"replicate": "^0.16.1",
"typescript": "^5.1.6"
},
"private": true,
"exports": {
".": "./build/index.js"
}
}

View File

@ -0,0 +1,69 @@
import { marshalErrorLike, RPCHost, RPCReflection } from 'civkit';
import { singleton } from 'tsyringe';
import { CloudHTTPv2, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
import _ from 'lodash';
import { PuppeteerControl } from '../services/puppeteer';
import TurnDownService from 'turndown';
/**
 * RPC host exposing the `crawl` endpoint: it drives {@link PuppeteerControl.scrap}
 * for a URL and streams every snapshot that has content to the caller as
 * Markdown over a server-sent-event stream.
 */
@singleton()
export class CrawlerHost extends RPCHost {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Converts snapshot HTML into Markdown. */
    turnDownService = new TurnDownService();

    constructor(
        protected globalLogger: Logger,
        protected puppeteerControl: PuppeteerControl,
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Crawls `url` and streams the extracted content as Markdown.
     *
     * Each snapshot yielded by the scraper that carries content is converted
     * with Turndown and written as a `data` event. A failure is logged and
     * written as a single `error` event. The stream is always ended.
     *
     * @param rpcReflect - RPC reflection handle; the stream is passed to
     *   `rpcReflect.return()` before scraping starts (presumably so events
     *   reach the client while the crawl is still running — confirm in civkit).
     * @param url - Address of the page to crawl (required request parameter).
     * @returns The server-sent-event stream that was handed to `rpcReflect`.
     */
    @CloudHTTPv2({
        exportInGroup: ['crawler'],
        httpMethod: ['get', 'post'],
        returnType: OutputServerEventStream,
    })
    async crawl(
        @RPCReflect() rpcReflect: RPCReflection,
        @Param('url', { required: true }) url: string
    ) {
        await this.serviceReady();

        const sseStream = new OutputServerEventStream();
        rpcReflect.return(sseStream);

        try {
            for await (const scrapped of this.puppeteerControl.scrap(url)) {
                const snapshot: unknown = scrapped.snapshot;
                // A snapshot is either a raw HTML string or an object exposing `content`.
                const content = typeof snapshot === 'string' ? snapshot : (snapshot as any)?.content;

                // Interpolating the snapshot object would only log "[object Object]".
                this.logger.info(`Scrapped: ${url}`, { hasContent: Boolean(content) });

                if (!content) {
                    continue;
                }

                sseStream.write({
                    event: 'data',
                    data: this.turnDownService.turndown(content),
                });
            }
        } catch (err: any) {
            const marshalled = marshalErrorLike(err);
            this.logger.error(`Failed to crawl ${url}`, { err: marshalled });
            // Send the marshalled form: a raw Error has no enumerable fields to serialize.
            sseStream.write({
                event: 'error',
                data: marshalled,
            });
        }

        sseStream.end();

        return sseStream;
    }
}

13
backend/functions/src/fetch.d.ts vendored Normal file
View File

@ -0,0 +1,13 @@
/**
 * Declares the fetch-related globals (`fetch`, `FormData`, `Headers`, `Request`,
 * `Response`, `File`) with the types exported by `undici`, and re-exports the
 * matching type names.
 */
declare global {
    export const {
        fetch,
        FormData,
        Headers,
        Request,
        Response,
        File,
    }: typeof import('undici');
    // `RequestInit` used to be listed twice here; each name may appear only once.
    export type { FormData, Headers, Request, RequestInit, Response, File } from 'undici';
}
export { };

View File

@ -0,0 +1,33 @@
// Cloud Functions entry module: declares the auth triggers and exports the
// functions collected in the shared registry.
// Side-effect import. NOTE(review): presumably must load before any decorated
// (tsyringe) module is evaluated — confirm.
import 'reflect-metadata';
import * as functions from 'firebase-functions';
import { initializeApp } from 'firebase-admin/app';
// Initialize the default Firebase Admin app.
// NOTE(review): this call sits between imports, presumably so the app exists before
// the shared modules below are evaluated (relies on the CommonJS output keeping
// statement order) — confirm before reordering.
initializeApp();
import secretExposer from './shared/services/secrets';
// Auth trigger for newly created users. Currently a placeholder: it ignores `user`
// and returns null.
export const onUserCreated = functions
.runWith({ secrets: [...secretExposer.bundle], memory: '512MB' })
.auth.user()
.onCreate(async (user) => {
return null;
});
// Auth trigger registered via beforeSignIn. Currently a placeholder that returns nothing.
export const onUserLogin = functions
.runWith({ secrets: [...secretExposer.bundle], memory: '512MB' })
.auth.user()
.beforeSignIn(async (user, _ctx) => {
return;
});
import { loadModulesDynamically, registry } from './shared';
import path from 'path';
// Load the modules under ./cloud-functions.
// NOTE(review): presumably loading them registers their decorated methods in `registry` — confirm.
loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
// Merge whatever registry.exportGrouped() returns into this module's exports,
// passing 1GiB memory and a 540-second timeout as options.
Object.assign(exports, registry.exportGrouped({
memory: '1GiB',
timeoutSeconds: 540,
}));
// Title and version set on the registry.
registry.title = 'url2text';
registry.version = '0.1.0';

View File

@ -0,0 +1,152 @@
import { AsyncService, Defer } from 'civkit';
import { container, singleton } from 'tsyringe';
import puppeteer, { Browser } from 'puppeteer';
import { Logger } from '../shared/services/logger';
import genericPool from 'generic-pool';
import os from 'os';
import fs from 'fs';
// Source text of Mozilla's Readability.js, read synchronously once at module load.
// It is passed to page.evaluateOnNewDocument() in newPage() below.
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
/**
 * Owns the Puppeteer browser and a pool of pages, each created in its own
 * browser context with Readability injected, and exposes {@link scrap} to
 * stream readable snapshots of a URL while it loads.
 */
@singleton()
export class PuppeteerControl extends AsyncService {

    /** Browser instance launched by {@link init}. */
    browser!: Browser;

    logger = this.globalLogger.child({ service: this.constructor.name });

    /**
     * Pool of prepared pages. Destroying a page closes its whole browser context.
     *
     * NOTE(review): per the generic-pool docs `validate` is only invoked when
     * `testOnBorrow` is enabled, which it is not here — confirm whether
     * validation on borrow is wanted.
     */
    pagePool = genericPool.createPool({
        create: async () => {
            const page = await this.newPage();
            return page;
        },
        destroy: async (page) => {
            await page.browserContext().close();
        },
        validate: async (page) => {
            return this.browser.connected && !page.isClosed();
        }
    }, {
        // One page per GiB of free memory (sampled once, at construction), never below one.
        // The divisor must be parenthesized: `x / 1024 * 1024 * 1024` multiplies instead.
        max: Math.max(1, Math.ceil(os.freemem() / (1024 * 1024 * 1024))),
        min: 0,
    });

    constructor(protected globalLogger: Logger) {
        super(...arguments);
    }

    /**
     * Launches the browser, closing any previously launched instance first,
     * and emits `ready`. Emits `crippled` once if the browser disconnects.
     */
    override async init() {
        await this.dependencyReady();

        if (this.browser) {
            await this.browser.close();
        }
        this.browser = await puppeteer.launch({
            // NOTE(review): headful mode needs a display; confirm this is intended
            // outside local debugging.
            headless: false,
            args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        this.browser.once('disconnected', () => {
            this.logger.warn(`Browser disconnected`);
            this.emit('crippled');
        });

        this.emit('ready');
    }

    /**
     * Creates a page in a dedicated browser context and prepares it for scraping:
     * fixed user agent and viewport, a `reportSnapshot` bridge that re-emits
     * in-page snapshots as a `snapshot` event on the page object, and an
     * injected script that parses the document with Readability as it loads.
     *
     * @returns The prepared page.
     */
    async newPage() {
        await this.serviceReady();
        const dedicatedContext = await this.browser.createBrowserContext();

        const page = await dedicatedContext.newPage();
        await page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`);
        await page.setViewport({ width: 1920, height: 1080 });
        await page.exposeFunction('reportSnapshot', (snapshot: any) => {
            page.emit('snapshot', snapshot);
        });

        await page.evaluateOnNewDocument(READABILITY_JS);

        // Everything inside this callback runs in the page, not in Node.
        await page.evaluateOnNewDocument(() => {
            // Assign the function itself; `window.giveSnapshot() = ...` is an invalid
            // assignment target and throws before anything is defined.
            // @ts-expect-error
            window.giveSnapshot = () => {
                // @ts-expect-error
                return new Readability(document.cloneNode(true)).parse();
            };
            let aftershot: any;
            const handlePageLoad = () => {
                // @ts-expect-error
                if (document.readyState !== 'complete' && document.readyState !== 'interactive') {
                    return;
                }
                // @ts-expect-error
                const parsed = window.giveSnapshot();
                if (parsed) {
                    // @ts-expect-error
                    window.reportSnapshot(parsed);
                } else {
                    // Nothing readable yet: retry once after 500ms, replacing any pending retry.
                    if (aftershot) {
                        clearTimeout(aftershot);
                    }
                    aftershot = setTimeout(() => {
                        // @ts-expect-error
                        window.reportSnapshot(window.giveSnapshot());
                    }, 500);
                }
            };
            // setInterval(handlePageLoad, 1000);
            // @ts-expect-error
            document.addEventListener('readystatechange', handlePageLoad);
            // The `load` event is dispatched on window; a listener on document never fires.
            // @ts-expect-error
            window.addEventListener('load', handlePageLoad);
        });

        // TODO: further setup the page;

        return page;
    }

    /**
     * Navigates a pooled page to `url` and yields `{ snapshot, screenshot }`
     * every time the page reports a new snapshot, plus one final pair taken
     * after navigation has settled. The page is always destroyed afterwards.
     *
     * Failures are logged and end the iteration; they are not rethrown.
     *
     * @param url - Address to load.
     */
    async *scrap(url: string) {
        const page = await this.pagePool.acquire();
        let snapshot: unknown;
        let nextSnapshotDeferred = Defer();
        let finalized = false;
        const hdl = (s: any) => {
            if (snapshot === s) {
                return;
            }
            snapshot = s;
            // Wake the loop below, then arm a fresh deferred for the next snapshot.
            nextSnapshotDeferred.resolve(s);
            nextSnapshotDeferred = Defer();
        };
        page.on('snapshot', hdl);
        const gotoPromise = page.goto(url, { waitUntil: 'networkidle2', timeout: 30_000 });
        const markFinalized = () => {
            finalized = true;
        };
        // Handle both outcomes here: `.finally()` would return a second promise that
        // rejects unhandled whenever navigation fails.
        gotoPromise.then(markFinalized, markFinalized);

        try {
            while (true) {
                await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
                const screenshot = await page.screenshot();
                if (finalized) {
                    // Surface a navigation failure, then take the definitive snapshot.
                    await gotoPromise;
                    snapshot = await page.evaluate('window.giveSnapshot()');
                    yield { snapshot, screenshot };
                    break;
                }
                yield { snapshot, screenshot };
            }
        } catch (err) {
            // Best effort: end the iteration quietly, but leave a trace of why.
            const reason = err instanceof Error ? (err.stack ?? err.message) : String(err);
            this.logger.warn(`Scraping of ${url} aborted: ${reason}`);
        } finally {
            page.off('snapshot', hdl);
            await this.pagePool.destroy(page);
        }
    }
}
// Resolve the PuppeteerControl instance from the tsyringe container at module load
// and expose it as this module's default export.
const puppeteerControl = container.resolve(PuppeteerControl);
export default puppeteerControl;

View File

@ -0,0 +1 @@
../../../thinapps-shared/backend

9
backend/functions/src/types.d.ts vendored Normal file
View File

@ -0,0 +1,9 @@
// Ambient type declarations for the `langdetect` package.
declare module 'langdetect' {
// One detection candidate.
// NOTE(review): presumably `lang` is a language code and `prob` its probability — confirm against the library.
interface DetectionResult {
lang: string;
prob: number;
}
// Returns a list of detection candidates for `text`.
export function detect(text: string): DetectionResult[];
// Returns a single string for `text`, or null.
// NOTE(review): presumably the most likely language code, null when nothing is detected — confirm.
export function detectOne(text: string): string | null;
}

View File

@ -0,0 +1,21 @@
{
"compilerOptions": {
"module": "commonjs",
"noImplicitReturns": true,
"noUnusedLocals": true,
"outDir": "build",
"sourceMap": true,
"strict": true,
"allowJs": true,
"target": "es2022",
"lib": ["es2022"],
"skipLibCheck": true,
"useDefineForClassFields": false,
"experimentalDecorators": true,
"emitDecoratorMetadata": true,
"esModuleInterop": true,
"noImplicitOverride": true,
},
"compileOnSave": true,
"include": ["src"]
}

8
backend/storage.rules Normal file
View File

@ -0,0 +1,8 @@
rules_version = '2';
// Cloud Storage security rules: the only rule in this file denies all reads and writes.
service firebase.storage {
match /b/{bucket}/o {
// Matches every object path in the bucket.
match /{allPaths=**} {
allow read, write: if false;
}
}
}

15
package.json Normal file
View File

@ -0,0 +1,15 @@
{
"name": "url2text",
"version": "1.0.0",
"description": "### Prerequisite - Node v18 (The build fails for Node version >18) - Yarn - Firebase CLI (`npm install -g firebase-tools`)",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"devDependencies": {
"firebase-tools": "^12.4.2",
"typescript": "^5.1.6"
}
}

1
thinapps-shared Submodule

@ -0,0 +1 @@
Subproject commit 9f0fa1dd7f8cfcea4c8d79252319b151fae6ed19