feat: add image captioning (#6)

* Fix contentText assignment in CrawlerHost class

* fix: recover vscode configurations

* feat: add image captioning

* feat: add image captioning

* clean: vscode config

* chore: fix some ts warnings

* feat: auto alt text

* fix

* chore: improve prompt

* clean: unused config

* fix: failure condition

* fix: remove redundant code

* fix: catch parse error

* fix: catch parse error

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
This commit is contained in:
Han Xiao 2024-04-15 20:51:31 -07:00 committed by GitHub
parent 18373626b2
commit b3fb4c5c57
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 502 additions and 37 deletions

4
.gitignore vendored
View File

@ -1,4 +1,2 @@
node_modules/
.DS_Store
.vscode
.cache
.DS_Store

10
.vscode/exensions.json vendored Normal file
View File

@ -0,0 +1,10 @@
{
"recommendations": [
"editorconfig.editorconfig",
"octref.vetur",
"redhat.vscode-yaml",
"dbaeumer.vscode-eslint",
"esbenp.prettier-vscode",
"streetsidesoftware.code-spell-checker"
]
}

60
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,60 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Debug Fullstack: attach",
"request": "attach",
"cwd": "${workspaceFolder}/backend/functions",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:debug"
},
{
"name": "Debug Fullstack: attach: with proxy",
"request": "attach",
"cwd": "${workspaceFolder}/backend/functions",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:debug:with-proxy"
},
{
"name": "Attach",
"port": 9229,
"request": "attach",
"skipFiles": [
"<node_internals>/**"
],
"type": "node"
},
{
"name": "Attach by Process ID",
"processId": "${command:PickProcess}",
"request": "attach",
"skipFiles": [
"<node_internals>/**"
],
"type": "node"
},
{
"name": "Debug Fullstack",
"request": "launch",
"runtimeArgs": [
"emulators:start",
"--import=../.firebase-emu",
"--export-on-exit=../.firebase-emu",
],
"cwd": "${workspaceFolder}/backend/functions",
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:prepare",
"killBehavior": "polite"
},
]
}

32
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,32 @@
{
"editor.wordWrap": "on",
"editor.wordWrapColumn": 120,
"files.trimTrailingWhitespace": true,
"files.trimFinalNewlines": true,
"[javascript]": {
"editor.defaultFormatter": "vscode.typescript-language-features"
},
"[jsonc]": {
"editor.defaultFormatter": "vscode.json-language-features"
},
"[typescript]": {
"editor.defaultFormatter": "vscode.typescript-language-features"
},
"[json]": {
"editor.defaultFormatter": "vscode.json-language-features"
},
"[yaml]": {
"editor.defaultFormatter": "redhat.vscode-yaml"
},
"[markdown]": {
"files.trimTrailingWhitespace": false
},
"typescript.tsdk": "node_modules/typescript/lib",
"typescript.preferences.quoteStyle": "single",
"typescript.format.semicolons": "insert",
"typescript.preferences.importModuleSpecifier": "project-relative",
"typescript.locale": "en",
"cSpell.enabled": true,
"cSpell.words": [
],
}

156
.vscode/tasks.json vendored Normal file
View File

@ -0,0 +1,156 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "npm",
"script": "build",
"group": "build",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [],
"label": "Backend:rebuild",
"detail": "Backend:rebuild"
},
{
"type": "npm",
"script": "emu:reset",
"group": "build",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [],
"label": "Backend:reset-emulator",
"detail": "Backend:reset-emulator"
},
{
"type": "typescript",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"tsconfig": "backend/functions/tsconfig.json",
"option": "watch",
"isBackground": true,
"problemMatcher": [
"$tsc-watch"
],
"group": "build",
"label": "Backend:build:watch"
},
{
"type": "npm",
"script": "emu:debug",
"group": "none",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [
{
"base": "$tsc",
"background": {
"activeOnStart": false,
"beginsPattern": "shutdown requested|Starting emulators",
"endsPattern": "Debugger listening"
}
}
],
"label": "Backend:start-emulator-debug",
"detail": "Backend:start-emulator-debug",
"dependsOn": [
"Backend:build:watch"
],
"isBackground": true,
},
{
"type": "npm",
"script": "dev",
"options": {
"cwd": "${workspaceFolder}/webapp",
},
"group": "build",
"label": "Frontend:start:dev",
"detail": "Frontend:start:dev",
"isBackground": true,
"problemMatcher": {
"base": "$vite",
"background": {
"activeOnStart": true,
"endsPattern": "OK",
"beginsPattern": "vite"
}
},
},
{
"type": "npm",
"script": "dev",
"options": {
"cwd": "${workspaceFolder}/webapp",
"env": {
"FIREBASE_EMULATE": "true",
}
},
"group": "build",
"label": "Frontend:start:emu",
"detail": "Frontend:start:emu",
"isBackground": true,
"problemMatcher": {
"base": "$vite",
"background": {
"activeOnStart": true,
"endsPattern": "OK",
"beginsPattern": "vite"
}
},
},
{
"type": "npm",
"script": "emu:debug2",
"group": "none",
"options": {
"cwd": "${workspaceFolder}/backend/functions",
"env": {
"https_proxy": "http://127.0.0.1:7890",
"http_proxy": "http://127.0.0.1:7890",
"all_proxy": "socks5://127.0.0.1:7890"
}
},
"problemMatcher": [
{
"base": "$tsc",
"background": {
"activeOnStart": false,
"beginsPattern": "shutdown requested|Starting emulators",
"endsPattern": "Debugger listening"
}
}
],
"label": "Backend:start-emulator-debug:with-proxy",
"detail": "Backend:start-emulator-debug:with-proxy",
"dependsOn": [
"Backend:build:watch"
],
"isBackground": true,
},
{
"label": "Fullstack:prepare",
"dependsOn": [
"Frontend:start:emu",
"Backend:build:watch",
],
},
{
"label": "Fullstack:debug",
"dependsOn": [
// "Frontend:start:emu",
"Backend:start-emulator-debug",
],
},
{
"label": "Fullstack:debug:with-proxy",
"dependsOn": [
"Frontend:start:emu",
"Backend:start-emulator-debug:with-proxy",
],
}
]
}

View File

@ -1,14 +1,9 @@
const { join } = require('path');
let config = {};
if (!process.env.FUNCTIONS_EMULATOR) {
config = {
// Changes the cache location for Puppeteer.
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
};
}
/**
* @type {import("puppeteer").Configuration}
*/
module.exports = config;
module.exports = {
// Changes the cache location for Puppeteer.
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
};

View File

@ -3,9 +3,10 @@ import { singleton } from 'tsyringe';
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer';
import TurnDownService from 'turndown';
import { Request, Response } from 'express';
import normalizeUrl from "@esm2cjs/normalize-url";
import { AltTextService } from '../services/alt-text';
import TurndownService from 'turndown';
function tidyMarkdown(markdown: string): string {
@ -50,11 +51,14 @@ function tidyMarkdown(markdown: string): string {
export class CrawlerHost extends RPCHost {
logger = this.globalLogger.child({ service: this.constructor.name });
turnDownService = new TurnDownService().use(require('turndown-plugin-gfm').gfm);
turnDownPlugins = [require('turndown-plugin-gfm').gfm];
imageShortUrlPrefix?: string;
constructor(
protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl,
protected altTextService: AltTextService,
) {
super(...arguments);
}
@ -65,14 +69,57 @@ export class CrawlerHost extends RPCHost {
this.emit('ready');
}
formatSnapshot(snapshot: PageSnapshot) {
async formatSnapshot(snapshot: PageSnapshot) {
const toBeTurnedToMd = snapshot.parsed?.content;
const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : '';
let turnDownService = new TurndownService();
for (const plugin of this.turnDownPlugins) {
turnDownService = turnDownService.use(plugin);
}
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
let contentText = '';
if (toBeTurnedToMd) {
const urlToAltMap: { [k: string]: { shortDigest: string, alt?: string; }; } = {};
const tasks = (snapshot.imgs || []).map(async (x) => {
const r = await this.altTextService.getAltTextAndShortDigest(x).catch((err)=> {
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
return undefined;
});
if (r) {
urlToAltMap[x.src.trim()] = r;
}
});
const cleanText = tidyMarkdown(contentText).trim();
await Promise.all(tasks);
let imgIdx = 0;
turnDownService.addRule('img-generated-alt', {
filter: 'img',
replacement: (_content, node) => {
const src = (node.getAttribute('src') || '').trim();
const alt = cleanAttribute(node.getAttribute('alt'));
if (!src) {
return '';
}
const mapped = urlToAltMap[src];
imgIdx++;
if (mapped) {
return `![Image ${imgIdx}: ${mapped.alt || alt}](${this.imageShortUrlPrefix ? `${this.imageShortUrlPrefix}/${mapped.shortDigest}` : src})`;
}
return `![Image ${imgIdx}: ${alt}](${src})`;
}
});
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
}
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
contentText = turnDownService.turndown(snapshot.html);
}
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
contentText = snapshot.text;
}
const cleanText = tidyMarkdown(contentText || '').trim();
const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
@ -142,7 +189,7 @@ ${this.content}
continue;
}
const formatted = this.formatSnapshot(scrapped);
const formatted = await this.formatSnapshot(scrapped);
if (scrapped.screenshot && screenshotEnabled) {
sseStream.write({
@ -177,7 +224,7 @@ ${this.content}
continue;
}
const formatted = this.formatSnapshot(scrapped);
const formatted = await this.formatSnapshot(scrapped);
return formatted;
}
@ -186,7 +233,7 @@ ${this.content}
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
}
return this.formatSnapshot(lastScrapped);
return await this.formatSnapshot(lastScrapped);
}
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
@ -195,7 +242,7 @@ ${this.content}
continue;
}
const formatted = this.formatSnapshot(scrapped);
const formatted = await this.formatSnapshot(scrapped);
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
}
@ -204,8 +251,12 @@ ${this.content}
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
}
return `${this.formatSnapshot(lastScrapped)}`;
return `${await this.formatSnapshot(lastScrapped)}`;
}
}
/**
 * Normalizes an HTML attribute value for embedding in Markdown.
 *
 * Every run consisting of a newline followed by any amount of whitespace
 * (including further newlines) is collapsed into a single `\n`.
 *
 * The parameter accepts `null`/`undefined` because DOM `getAttribute()`
 * returns `null` when the attribute is absent; such input (and the empty
 * string) yields `''`.
 *
 * @param attribute Raw attribute value, possibly missing.
 * @returns The cleaned attribute text, or `''` when there is nothing to clean.
 */
function cleanAttribute(attribute: string | null | undefined): string {
    return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '';
}

View File

@ -0,0 +1,42 @@
import { Also, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash';
/**
 * Record type for per-image alt-text data, extending `FirestoreRecord`.
 *
 * Each instance describes one image URL together with its original and
 * machine-generated alt text.
 * NOTE(review): `collectionName` presumably names the Firestore collection
 * these records live in — confirm against `FirestoreRecord`.
 */
@Also({
dictOf: Object
})
export class ImgAlt extends FirestoreRecord {
static override collectionName = 'imgAlts';
// Document identifier.
// NOTE(review): AltTextService appears to use the base64url form of the URL digest here — confirm.
override _id!: string;
// Image source URL (required).
@Prop({
required: true
})
src!: string;
// Digest of `src` (required).
// NOTE(review): AltTextService appears to store a hex MD5 of the URL here — confirm.
@Prop({
required: true
})
urlDigest!: string;
// Image dimensions; optional.
// NOTE(review): units and whether these are natural or rendered sizes are not fixed by this class.
@Prop()
width?: number;
@Prop()
height?: number;
// Alt text produced by the captioning pipeline, if any.
@Prop()
generatedAlt?: string;
// Alt text that came with the image in the source page, if any.
@Prop()
originalAlt?: string;
// Creation timestamp. Declared non-optional in the type, but not marked `required` in @Prop.
@Prop()
createdAt!: Date;
// Optional expiry timestamp.
// NOTE(review): nothing visible here sets or enforces it — confirm how expiry is applied.
@Prop()
expireAt?: Date;
// Index signature: allows arbitrary additional properties on the record.
[k: string]: any;
}

View File

@ -0,0 +1,91 @@
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
import { singleton } from 'tsyringe';
import { Logger } from '../shared/services/logger';
import { CanvasService } from '../shared/services/canvas';
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
import { ImgBrief } from './puppeteer';
import { ImgAlt } from '../db/img-alt';
const md5Hasher = new HashManager('md5', 'hex');
/**
 * Produces alt text for images and remembers the result per image URL.
 *
 * Captions are generated by rendering the image into a bounded square
 * canvas, exporting it as PNG and passing it to the image interrogator.
 * Results are stored as `ImgAlt` records keyed by a short digest of the URL.
 */
@singleton()
export class AltTextService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected imageInterrogator: ImageInterrogationManager,
        protected canvasService: CanvasService
    ) {
        super(...arguments);
    }

    /** Waits until dependencies are ready, then emits `ready`. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Generates a caption for the image at `url`.
     *
     * The raw caption has newlines and double quotes removed, a trailing
     * period (with any trailing whitespace) dropped, and is trimmed.
     *
     * @param url Location of the image to caption.
     * @returns The cleaned caption text.
     * @throws AssertionFailureError wrapping any failure while loading,
     *         resizing, exporting or interrogating the image.
     */
    async caption(url: string) {
        try {
            const loadedImage = await this.canvasService.loadImage(url);
            const boxedCanvas = this.canvasService.fitImageToSquareBox(loadedImage, 1024);
            const pngBuffer = await this.canvasService.canvasToBuffer(boxedCanvas, 'image/png');

            const rawCaption = await this.imageInterrogator.interrogate('blip2', {
                image: pngBuffer,
                // prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
            });

            return rawCaption.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
        } catch (err) {
            throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err });
        }
    }

    /**
     * Resolves the short digest and alt text for an image.
     *
     * Flow:
     *  1. No `src` -> `undefined`.
     *  2. A stored record with a non-empty generated alt -> returned as is.
     *  3. Otherwise a caption is generated only when the image has no alt
     *     of its own; a captioning failure is logged and tolerated.
     *  4. The record is written (merged) and the outcome returned; `alt`
     *     is `undefined` when no caption was generated.
     *
     * @param imgBrief Summary of the image as collected from the page.
     * @returns `{ shortDigest, alt }`, or `undefined` when `src` is empty.
     */
    async getAltTextAndShortDigest(imgBrief: ImgBrief) {
        if (!imgBrief.src) {
            return undefined;
        }

        // MD5 of the URL in hex, re-encoded as base64url to get a shorter key.
        const urlDigest = md5Hasher.hash(imgBrief.src);
        const shortDigest = Buffer.from(urlDigest, 'hex').toString('base64url');

        const stored = await ImgAlt.fromFirestore(shortDigest);
        const storedAlt = stored?.generatedAlt;
        if (storedAlt) {
            return { shortDigest, alt: storedAlt };
        }

        let freshCaption;
        const lacksOwnAlt = !imgBrief.alt;
        if (lacksOwnAlt) {
            try {
                freshCaption = await this.caption(imgBrief.src);
            } catch (err) {
                // Best effort: the record is still written without a generated alt.
                this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
            }
        }

        const docRef = ImgAlt.COLLECTION.doc(shortDigest);
        await docRef.set(
            {
                _id: shortDigest,
                src: imgBrief.src || '',
                width: imgBrief.naturalWidth || 0,
                height: imgBrief.naturalHeight || 0,
                urlDigest: urlDigest,
                originalAlt: imgBrief.alt || '',
                generatedAlt: freshCaption || '',
                createdAt: new Date()
            }, { merge: true }
        );

        return { shortDigest, alt: freshCaption };
    }
}

View File

@ -7,11 +7,19 @@ import os from 'os';
import fs from 'fs';
import { Crawled } from '../db/crawled';
import puppeteer from 'puppeteer-extra';
import puppeteerStealth from 'puppeteer-extra-plugin-stealth';
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
/**
 * Summary of a single `<img>` element, as collected in the page by the
 * injected `briefImgs` script (which selects `img[src]` elements).
 */
export interface ImgBrief {
// The element's `src` property as read in the page.
src: string;
// The element's `complete` flag at the time of collection.
loaded: boolean;
// The element's `width` / `height` properties.
width: number;
height: number;
// The element's `naturalWidth` / `naturalHeight` properties.
naturalWidth: number;
naturalHeight: number;
// The element's `alt`, falling back to its `title` when `alt` is empty.
alt?: string;
}
export interface PageSnapshot {
title: string;
href: string;
@ -30,13 +38,16 @@ export interface PageSnapshot {
publishedTime: string;
} | null;
screenshot?: Buffer;
imgs?: ImgBrief[];
}
const md5Hasher = new HashManager('md5', 'hex');
const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
puppeteer.use(puppeteerStealth());
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
// puppeteer.use(puppeteerUAOverride({
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
// platform: `Linux`,
// }))
@singleton()
@ -84,7 +95,7 @@ export class PuppeteerControl extends AsyncService {
this.browser = await puppeteer.launch({
headless: true,
timeout: 10_000
}).catch((err) => {
}).catch((err: any) => {
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
process.nextTick(() => {
this.emit('error', err);
@ -117,23 +128,42 @@ export class PuppeteerControl extends AsyncService {
}));
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
preparations.push(page.evaluateOnNewDocument(`
function briefImgs(elem) {
const imageTags = Array.from((elem || document).querySelectorAll('img[src]'));
return imageTags.map((x)=> ({
src: x.src,
loaded: x.complete,
width: x.width,
height: x.height,
naturalWidth: x.naturalWidth,
naturalHeight: x.naturalHeight,
alt: x.alt || x.title,
}));
}
function giveSnapshot() {
let parsedContent;
let parsed;
try {
// Attempt to parse the cloned document
parsedContent = new Readability(document.cloneNode(true)).parse();
} catch (error) {
// If an error occurs, log it and set parsedContent to undefined
parsedContent = undefined;
parsed = new Readability(document.cloneNode(true)).parse();
} catch (err) {
void 0;
}
return {
const r = {
title: document.title,
href: document.location.href,
html: document.documentElement.outerHTML,
text: document.body.innerText,
parsed: parsedContent
parsed: parsed,
imgs: [],
};
if (parsed && parsed.content) {
const elem = document.createElement('div');
elem.innerHTML = parsed.content;
r.imgs = briefImgs(elem);
}
return r;
}
`));
preparations.push(page.evaluateOnNewDocument(() => {

@ -1 +1 @@
Subproject commit 9f0fa1dd7f8cfcea4c8d79252319b151fae6ed19
Subproject commit bea967a371581c1109dc0101dbcab196e9ed9ade