From b3fb4c5c57e2345b710b60dfe5f51262eb70be85 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Mon, 15 Apr 2024 20:51:31 -0700 Subject: [PATCH] feat: add image captioning (#6) * Fix contentText assignment in CrawlerHost class * fix: recover vscode configurations * feat: add image captioning * feat: add image captioning * clean: vscode config * chore: fix some ts warnings * feat: auto alt text * fix * chore: improve prompt * clean: unused config * fix: failure condition * fix: remove redundant code * fix: catch parse error * fix: catch parse error --------- Co-authored-by: Yanlong Wang --- .gitignore | 4 +- .vscode/exensions.json | 10 ++ .vscode/launch.json | 60 +++++++ .vscode/settings.json | 32 ++++ .vscode/tasks.json | 156 ++++++++++++++++++ backend/functions/.puppeteerrc.cjs | 13 +- .../functions/src/cloud-functions/crawler.ts | 75 +++++++-- backend/functions/src/db/img-alt.ts | 42 +++++ backend/functions/src/services/alt-text.ts | 91 ++++++++++ backend/functions/src/services/puppeteer.ts | 54 ++++-- thinapps-shared | 2 +- 11 files changed, 502 insertions(+), 37 deletions(-) create mode 100644 .vscode/exensions.json create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 backend/functions/src/db/img-alt.ts create mode 100644 backend/functions/src/services/alt-text.ts diff --git a/.gitignore b/.gitignore index 588ce23..12ac647 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ node_modules/ -.DS_Store -.vscode -.cache \ No newline at end of file +.DS_Store \ No newline at end of file diff --git a/.vscode/exensions.json b/.vscode/exensions.json new file mode 100644 index 0000000..37bdb37 --- /dev/null +++ b/.vscode/exensions.json @@ -0,0 +1,10 @@ +{ + "recommendations": [ + "editorconfig.editorconfig", + "octref.vetur", + "redhat.vscode-yaml", + "dbaeumer.vscode-eslint", + "esbenp.prettier-vscode", + "streetsidesoftware.code-spell-checker" + ] +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..c7cab1b --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,60 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Fullstack: attach", + "request": "attach", + "cwd": "${workspaceFolder}/backend/functions", + "skipFiles": [ + "/**" + ], + "type": "node", + "preLaunchTask": "Fullstack:debug" + }, + { + "name": "Debug Fullstack: attach: with proxy", + "request": "attach", + "cwd": "${workspaceFolder}/backend/functions", + "skipFiles": [ + "/**" + ], + "type": "node", + "preLaunchTask": "Fullstack:debug:with-proxy" + }, + { + "name": "Attach", + "port": 9229, + "request": "attach", + "skipFiles": [ + "/**" + ], + "type": "node" + }, + { + "name": "Attach by Process ID", + "processId": "${command:PickProcess}", + "request": "attach", + "skipFiles": [ + "/**" + ], + "type": "node" + }, + { + "name": "Debug Fullstack", + "request": "launch", + "runtimeArgs": [ + "emulators:start", + "--import=../.firebase-emu", + "--export-on-exit=../.firebase-emu", + ], + "cwd": "${workspaceFolder}/backend/functions", + "runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase", + "skipFiles": [ + "/**" + ], + "type": "node", + "preLaunchTask": "Fullstack:prepare", + "killBehavior": "polite" + }, + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..87ecca7 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,32 @@ +{ + "editor.wordWrap": "on", + "editor.wordWrapColumn": 120, + "files.trimTrailingWhitespace": true, + "files.trimFinalNewlines": true, + "[javascript]": { + "editor.defaultFormatter": "vscode.typescript-language-features" + }, + "[jsonc]": { + "editor.defaultFormatter": "vscode.json-language-features" + }, + "[typescript]": { + "editor.defaultFormatter": "vscode.typescript-language-features" + }, + "[json]": { + "editor.defaultFormatter": "vscode.json-language-features" + }, + "[yaml]": { + "editor.defaultFormatter": "redhat.vscode-yaml" + }, + "[markdown]": { + "files.trimTrailingWhitespace": false + }, + "typescript.tsdk": "node_modules/typescript/lib", + "typescript.preferences.quoteStyle": "single", + "typescript.format.semicolons": "insert", + "typescript.preferences.importModuleSpecifier": "project-relative", + "typescript.locale": "en", + "cSpell.enabled": true, + "cSpell.words": [ + ], +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..fc4489b --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,156 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "npm", + "script": "build", + "group": "build", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "problemMatcher": [], + "label": "Backend:rebuild", + "detail": "Backend:rebuild" + }, + { + "type": "npm", + "script": "emu:reset", + "group": "build", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "problemMatcher": [], + "label": "Backend:reset-emulator", + "detail": "Backend:reset-emulator" + }, + { + "type": "typescript", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "tsconfig": "backend/functions/tsconfig.json", + "option": "watch", + "isBackground": true, + "problemMatcher": [ + "$tsc-watch" + ], + "group": "build", + "label": "Backend:build:watch" + }, + { + "type": "npm", + "script": "emu:debug", + "group": "none", + "options": { + "cwd": "${workspaceFolder}/backend/functions" + }, + "problemMatcher": [ + { + "base": "$tsc", + "background": { + "activeOnStart": false, + "beginsPattern": "shutdown requested|Starting emulators", + "endsPattern": "Debugger listening" + } + } + ], + "label": "Backend:start-emulator-debug", + "detail": "Backend:start-emulator-debug", + "dependsOn": [ + "Backend:build:watch" + ], + "isBackground": true, + }, + { + "type": "npm", + "script": "dev", + "options": { + "cwd": "${workspaceFolder}/webapp", + }, + "group": "build", + "label": "Frontend:start:dev", + "detail": "Frontend:start:dev", + "isBackground": true, + "problemMatcher": { + "base": "$vite", + "background": { + "activeOnStart": true, + "endsPattern": "OK", + "beginsPattern": "vite" + } + }, + }, + { + "type": "npm", + "script": "dev", + "options": { + "cwd": "${workspaceFolder}/webapp", + "env": { + "FIREBASE_EMULATE": "true", + } + }, + "group": "build", + "label": "Frontend:start:emu", + "detail": "Frontend:start:emu", + "isBackground": true, + "problemMatcher": { + "base": "$vite", + "background": { + "activeOnStart": true, + "endsPattern": "OK", + "beginsPattern": "vite" + } + }, + }, + { + "type": "npm", + "script": "emu:debug2", + "group": "none", + "options": { + "cwd": "${workspaceFolder}/backend/functions", + "env": { + "https_proxy": "http://127.0.0.1:7890", + "http_proxy": "http://127.0.0.1:7890", + "all_proxy": "socks5://127.0.0.1:7890" + } + }, + "problemMatcher": [ + { + "base": "$tsc", + "background": { + "activeOnStart": false, + "beginsPattern": "shutdown requested|Starting emulators", + "endsPattern": "Debugger listening" + } + } + ], + "label": "Backend:start-emulator-debug:with-proxy", + "detail": "Backend:start-emulator-debug:with-proxy", + "dependsOn": [ + "Backend:build:watch" + ], + "isBackground": true, + }, + { + "label": "Fullstack:prepare", + "dependsOn": [ + "Frontend:start:emu", + "Backend:build:watch", + ], + }, + { + "label": "Fullstack:debug", + "dependsOn": [ + // "Frontend:start:emu", + "Backend:start-emulator-debug", + ], + }, + { + "label": "Fullstack:debug:with-proxy", + "dependsOn": [ + "Frontend:start:emu", + "Backend:start-emulator-debug:with-proxy", + ], + } + ] +} \ No newline at end of file diff --git a/backend/functions/.puppeteerrc.cjs b/backend/functions/.puppeteerrc.cjs index 36f44b1..574a6bd 100644 --- a/backend/functions/.puppeteerrc.cjs +++ b/backend/functions/.puppeteerrc.cjs @@ -1,14 +1,9 @@ const { join } = require('path'); -let config = {}; -if (!process.env.FUNCTIONS_EMULATOR) { - config = { - // Changes the cache location for Puppeteer. - cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'), - }; -} - /** * @type {import("puppeteer").Configuration} */ -module.exports = config; +module.exports = { + // Changes the cache location for Puppeteer. + cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'), +}; diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index feedab2..b968d7a 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -3,9 +3,10 @@ import { singleton } from 'tsyringe'; import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import _ from 'lodash'; import { PageSnapshot, PuppeteerControl } from '../services/puppeteer'; -import TurnDownService from 'turndown'; import { Request, Response } from 'express'; import normalizeUrl from "@esm2cjs/normalize-url"; +import { AltTextService } from '../services/alt-text'; +import TurndownService from 'turndown'; function tidyMarkdown(markdown: string): string { @@ -50,11 +51,14 @@ function tidyMarkdown(markdown: string): string { export class CrawlerHost extends RPCHost { logger = this.globalLogger.child({ service: this.constructor.name }); - turnDownService = new TurnDownService().use(require('turndown-plugin-gfm').gfm); + turnDownPlugins = [require('turndown-plugin-gfm').gfm]; + + imageShortUrlPrefix?: string; constructor( protected globalLogger: Logger, protected puppeteerControl: PuppeteerControl, + protected altTextService: AltTextService, ) { super(...arguments); } @@ -65,14 +69,57 @@ export class CrawlerHost extends RPCHost { this.emit('ready'); } - formatSnapshot(snapshot: PageSnapshot) { - + async formatSnapshot(snapshot: PageSnapshot) { const toBeTurnedToMd = snapshot.parsed?.content; - const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : ''; + let turnDownService = new TurndownService(); + for (const plugin of this.turnDownPlugins) { + turnDownService = turnDownService.use(plugin); + } - const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim(); + let contentText = ''; + if (toBeTurnedToMd) { + const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {}; + const tasks = (snapshot.imgs || []).map(async (x) => { + const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> { + this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) }); + return undefined; + }); + if (r) { + urlToAltMap[x.src.trim()] = r; + } + }); - const cleanText = tidyMarkdown(contentText).trim(); + await Promise.all(tasks); + let imgIdx = 0; + + turnDownService.addRule('img-generated-alt', { + filter: 'img', + replacement: (_content, node) => { + const src = (node.getAttribute('src') || '').trim(); + const alt = cleanAttribute(node.getAttribute('alt')); + if (!src) { + return ''; + } + const mapped = urlToAltMap[src]; + imgIdx++; + if (mapped) { + return `![Image ${imgIdx}: ${mapped.alt || alt}](${this.imageShortUrlPrefix ? `${this.imageShortUrlPrefix}/${mapped.shortDigest}` : src})`; + } + return `![Image ${imgIdx}: ${alt}](${src})`; + } + }); + + contentText = turnDownService.turndown(toBeTurnedToMd).trim(); + } + + if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) { + contentText = turnDownService.turndown(snapshot.html); + } + if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) { + contentText = snapshot.text; + } + + const cleanText = tidyMarkdown(contentText || '').trim(); const formatted = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), @@ -142,7 +189,7 @@ ${this.content} continue; } - const formatted = this.formatSnapshot(scrapped); + const formatted = await this.formatSnapshot(scrapped); if (scrapped.screenshot && screenshotEnabled) { sseStream.write({ @@ -177,7 +224,7 @@ ${this.content} continue; } - const formatted = this.formatSnapshot(scrapped); + const formatted = await this.formatSnapshot(scrapped); return formatted; } @@ -186,7 +233,7 @@ ${this.content} throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`); } - return this.formatSnapshot(lastScrapped); + return await this.formatSnapshot(lastScrapped); } for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { @@ -195,7 +242,7 @@ ${this.content} continue; } - const formatted = this.formatSnapshot(scrapped); + const formatted = await this.formatSnapshot(scrapped); return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); } @@ -204,8 +251,12 @@ ${this.content} throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`); } - return `${this.formatSnapshot(lastScrapped)}`; + return `${await this.formatSnapshot(lastScrapped)}`; } } + +function cleanAttribute(attribute: string) { + return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; +} diff --git a/backend/functions/src/db/img-alt.ts b/backend/functions/src/db/img-alt.ts new file mode 100644 index 0000000..a1221ba --- /dev/null +++ b/backend/functions/src/db/img-alt.ts @@ -0,0 +1,42 @@ +import { Also, Prop } from 'civkit'; +import { FirestoreRecord } from '../shared/lib/firestore'; +import _ from 'lodash'; + +@Also({ + dictOf: Object +}) +export class ImgAlt extends FirestoreRecord { + static override collectionName = 'imgAlts'; + + override _id!: string; + + @Prop({ + required: true + }) + src!: string; + + @Prop({ + required: true + }) + urlDigest!: string; + + @Prop() + width?: number; + + @Prop() + height?: number; + + @Prop() + generatedAlt?: string; + + @Prop() + originalAlt?: string; + + @Prop() + createdAt!: Date; + + @Prop() + expireAt?: Date; + + [k: string]: any; +} diff --git a/backend/functions/src/services/alt-text.ts b/backend/functions/src/services/alt-text.ts new file mode 100644 index 0000000..c3dd5d9 --- /dev/null +++ b/backend/functions/src/services/alt-text.ts @@ -0,0 +1,91 @@ +import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; +import { singleton } from 'tsyringe'; +import { Logger } from '../shared/services/logger'; +import { CanvasService } from '../shared/services/canvas'; +import { ImageInterrogationManager } from '../shared/services/common-iminterrogate'; +import { ImgBrief } from './puppeteer'; +import { ImgAlt } from '../db/img-alt'; + + +const md5Hasher = new HashManager('md5', 'hex'); + +@singleton() +export class AltTextService extends AsyncService { + + logger = this.globalLogger.child({ service: this.constructor.name }); + + constructor( + protected globalLogger: Logger, + protected imageInterrogator: ImageInterrogationManager, + protected canvasService: CanvasService + ) { + super(...arguments); + } + + override async init() { + await this.dependencyReady(); + this.emit('ready'); + } + + async caption(url: string) { + try { + const img = await this.canvasService.loadImage(url); + const resized = this.canvasService.fitImageToSquareBox(img, 1024); + const exported = await this.canvasService.canvasToBuffer(resized, 'image/png'); + + const r = await this.imageInterrogator.interrogate('blip2', { + image: exported, + // prompt: `A formal caption in one sentence, concise and in the third person: HTML alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.` + }); + + return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim(); + } catch (err) { + throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err }); + } + } + + async getAltTextAndShortDigest(imgBrief: ImgBrief) { + if (!imgBrief.src) { + return undefined; + } + const digest = md5Hasher.hash(imgBrief.src); + const shortDigest = Buffer.from(digest, 'hex').toString('base64url'); + + const existing = await ImgAlt.fromFirestore(shortDigest); + + if (existing?.generatedAlt) { + return { + shortDigest, + alt: existing.generatedAlt, + }; + } + + let generatedCaption; + + if (!imgBrief.alt) { + try { + generatedCaption = await this.caption(imgBrief.src); + } catch (err) { + this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err }); + } + } + + await ImgAlt.COLLECTION.doc(shortDigest).set( + { + _id: shortDigest, + src: imgBrief.src || '', + width: imgBrief.naturalWidth || 0, + height: imgBrief.naturalHeight || 0, + urlDigest: digest, + originalAlt: imgBrief.alt || '', + generatedAlt: generatedCaption || '', + createdAt: new Date() + }, { merge: true } + ); + + return { + shortDigest, + alt: generatedCaption, + }; + } +} diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index c888a92..3a5bd78 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -7,11 +7,19 @@ import os from 'os'; import fs from 'fs'; import { Crawled } from '../db/crawled'; import puppeteer from 'puppeteer-extra'; -import puppeteerStealth from 'puppeteer-extra-plugin-stealth'; - const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); +export interface ImgBrief { + src: string; + loaded: boolean; + width: number; + height: number; + naturalWidth: number; + naturalHeight: number; + alt?: string; +} + export interface PageSnapshot { title: string; href: string; @@ -30,13 +38,16 @@ export interface PageSnapshot { publishedTime: string; } | null; screenshot?: Buffer; + imgs?: ImgBrief[]; } const md5Hasher = new HashManager('md5', 'hex'); +const puppeteerStealth = require('puppeteer-extra-plugin-stealth'); puppeteer.use(puppeteerStealth()); // const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override'); // puppeteer.use(puppeteerUAOverride({ -// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)` +// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`, +// platform: `Linux`, // })) @singleton() @@ -84,7 +95,7 @@ export class PuppeteerControl extends AsyncService { this.browser = await puppeteer.launch({ headless: true, timeout: 10_000 - }).catch((err) => { + }).catch((err: any) => { this.logger.error(`Unknown firebase issue, just die fast.`, { err }); process.nextTick(() => { this.emit('error', err); @@ -117,23 +128,42 @@ export class PuppeteerControl extends AsyncService { })); preparations.push(page.evaluateOnNewDocument(READABILITY_JS)); preparations.push(page.evaluateOnNewDocument(` +function briefImgs(elem) { + const imageTags = Array.from((elem || document).querySelectorAll('img[src]')); + + return imageTags.map((x)=> ({ + src: x.src, + loaded: x.complete, + width: x.width, + height: x.height, + naturalWidth: x.naturalWidth, + naturalHeight: x.naturalHeight, + alt: x.alt || x.title, + })); +} function giveSnapshot() { - let parsedContent; + let parsed; try { - // Attempt to parse the cloned document - parsedContent = new Readability(document.cloneNode(true)).parse(); - } catch (error) { - // If an error occurs, log it and set parsedContent to undefined - parsedContent = undefined; + parsed = new Readability(document.cloneNode(true)).parse(); + } catch (err) { + void 0; } - return { + const r = { title: document.title, href: document.location.href, html: document.documentElement.outerHTML, text: document.body.innerText, - parsed: parsedContent + parsed: parsed, + imgs: [], }; + if (parsed && parsed.content) { + const elem = document.createElement('div'); + elem.innerHTML = parsed.content; + r.imgs = briefImgs(elem); + } + + return r; } `)); preparations.push(page.evaluateOnNewDocument(() => { diff --git a/thinapps-shared b/thinapps-shared index 9f0fa1d..bea967a 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 9f0fa1dd7f8cfcea4c8d79252319b151fae6ed19 +Subproject commit bea967a371581c1109dc0101dbcab196e9ed9ade