mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-15 19:22:20 +08:00
feat: web search (#57)
This commit is contained in:
parent
f171e54ac9
commit
2e3c217479
3
backend/.gitignore
vendored
3
backend/.gitignore
vendored
|
@ -75,4 +75,5 @@ build/
|
|||
.DS_Store
|
||||
|
||||
*.local
|
||||
.secret.*
|
||||
.secret.*
|
||||
licensed/
|
11
backend/functions/integrity-check.cjs
Executable file
11
backend/functions/integrity-check.cjs
Executable file
|
@ -0,0 +1,11 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
const fs = require('fs');
const path = require('path');

// Build-time guard: the GeoLite2 city database is expected in the
// `licensed/` directory next to this script. Abort with a non-zero exit
// code when it is missing so the failure is reported before compilation.
const geoDbPath = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb');
const geoDbPresent = fs.existsSync(geoDbPath);

if (geoDbPresent === false) {
    console.error(`Integrity check failed: ${geoDbPath} does not exist.`);
    process.exit(1);
}
|
31
backend/functions/package-lock.json
generated
31
backend/functions/package-lock.json
generated
|
@ -24,6 +24,7 @@
|
|||
"htmlparser2": "^9.0.0",
|
||||
"jose": "^5.1.0",
|
||||
"langdetect": "^0.2.1",
|
||||
"maxmind": "^4.3.18",
|
||||
"minio": "^7.1.3",
|
||||
"openai": "^4.20.0",
|
||||
"puppeteer": "^22.7.1",
|
||||
|
@ -8144,6 +8145,19 @@
|
|||
"tmpl": "1.0.5"
|
||||
}
|
||||
},
|
||||
"node_modules/maxmind": {
|
||||
"version": "4.3.18",
|
||||
"resolved": "https://registry.npmjs.org/maxmind/-/maxmind-4.3.18.tgz",
|
||||
"integrity": "sha512-5b9utU7ZxcGYTBaO7hCF0FXyfw3IpankLn+FnLW4RZS1zi97RBeSdfXJFJlk5UxNsMiFZlsdMT3lzvD+bD8MLQ==",
|
||||
"dependencies": {
|
||||
"mmdb-lib": "2.1.0",
|
||||
"tiny-lru": "11.2.5"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12",
|
||||
"npm": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/media-typer": {
|
||||
"version": "0.3.0",
|
||||
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
|
||||
|
@ -8375,6 +8389,15 @@
|
|||
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
|
||||
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
|
||||
},
|
||||
"node_modules/mmdb-lib": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/mmdb-lib/-/mmdb-lib-2.1.0.tgz",
|
||||
"integrity": "sha512-tdDTZmnI5G4UoSctv2KxM/3VQt2XRj4CmR5R4VsAWsOUcS3LysHR34wtixWm/pXxXdkBDuN92auxkC0T2+qd1Q==",
|
||||
"engines": {
|
||||
"node": ">=10",
|
||||
"npm": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/mongodb": {
|
||||
"version": "5.9.2",
|
||||
"resolved": "https://registry.npmjs.org/mongodb/-/mongodb-5.9.2.tgz",
|
||||
|
@ -11059,6 +11082,14 @@
|
|||
"resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.13.tgz",
|
||||
"integrity": "sha512-JaL9ZnvTbGFMDIBeGdVkLt4qWTeCPw+n7Ock+wceAGRenuHA6nOOvMJFliNDyXsjg2osGKJWsXtO2xc74VxyDw=="
|
||||
},
|
||||
"node_modules/tiny-lru": {
|
||||
"version": "11.2.5",
|
||||
"resolved": "https://registry.npmjs.org/tiny-lru/-/tiny-lru-11.2.5.tgz",
|
||||
"integrity": "sha512-JpqM0K33lG6iQGKiigcwuURAKZlq6rHXfrgeL4/I8/REoyJTGU+tEMszvT/oTRVHG2OiylhGDjqPp1jWMlr3bw==",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/tld-extract": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz",
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"name": "reader",
|
||||
"scripts": {
|
||||
"lint": "eslint --ext .js,.ts .",
|
||||
"build": "tsc -p .",
|
||||
"build": "node ./integrity-check.cjs && tsc -p .",
|
||||
"build:watch": "tsc --watch",
|
||||
"build:clean": "rm -rf ./build",
|
||||
"shell": "npm run build && firebase functions:shell",
|
||||
|
@ -44,6 +44,7 @@
|
|||
"htmlparser2": "^9.0.0",
|
||||
"jose": "^5.1.0",
|
||||
"langdetect": "^0.2.1",
|
||||
"maxmind": "^4.3.18",
|
||||
"minio": "^7.1.3",
|
||||
"openai": "^4.20.0",
|
||||
"puppeteer": "^22.7.1",
|
||||
|
|
|
@ -2,7 +2,7 @@ import {
|
|||
assignTransferProtocolMeta, marshalErrorLike,
|
||||
RPCHost, RPCReflection,
|
||||
HashManager,
|
||||
AssertionFailureError, ParamValidationError,
|
||||
AssertionFailureError, ParamValidationError, Defer,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
|
@ -34,6 +34,12 @@ export class CrawlerHost extends RPCHost {
|
|||
cacheValidMs = 1000 * 300;
|
||||
urlValidMs = 1000 * 3600 * 4;
|
||||
|
||||
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
|
||||
[Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY
|
||||
[Homepage] https://jina.ai/reader
|
||||
[Source code] https://github.com/jina-ai/reader
|
||||
`;
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected puppeteerControl: PuppeteerControl,
|
||||
|
@ -357,10 +363,7 @@ ${this.content}
|
|||
[Balance left] ${latestUser.wallet.total_balance}
|
||||
` : '';
|
||||
|
||||
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
|
||||
[Homepage] https://jina.ai/reader
|
||||
[Source code] https://github.com/jina-ai/reader
|
||||
${authMixin}`,
|
||||
return assignTransferProtocolMeta(`${this.indexText}${authMixin}`,
|
||||
{ contentType: 'text/plain', envelope: null }
|
||||
);
|
||||
}
|
||||
|
@ -638,13 +641,13 @@ ${authMixin}`,
|
|||
return r;
|
||||
}
|
||||
|
||||
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
|
||||
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
|
||||
let cache;
|
||||
if (!noCache && !crawlOpts.cookies?.length) {
|
||||
if (!noCache && !crawlOpts?.cookies?.length) {
|
||||
cache = await this.queryCache(urlToCrawl);
|
||||
}
|
||||
|
||||
if (cache?.isFresh && (!crawlOpts.favorScreenshot || (crawlOpts.favorScreenshot && cache?.screenshotAvailable))) {
|
||||
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
||||
yield cache.snapshot;
|
||||
|
||||
return;
|
||||
|
@ -683,4 +686,47 @@ ${authMixin}`,
|
|||
return undefined;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Scrapes several URLs concurrently and streams their combined progress.
 *
 * One `cachedScrap` iterator is started per URL. The generator yields the
 * same `results` array repeatedly; slot `i` holds the most recent snapshot
 * produced for `urls[i]` (or `undefined` while nothing has arrived yet).
 * The first yield happens immediately, then one yield follows each batch
 * of updates, and a last yield is emitted when every iterator has finished.
 *
 * @param urls URLs to scrape; the yielded array is index-aligned with it.
 * @param options Scrapping options forwarded to every `cachedScrap` call.
 * @param noCache Forwarded to `cachedScrap`.
 */
async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
    const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));

    const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

    let nextDeferred = Defer();
    let concluded = false;
    // True while `results` holds an update that has not been yielded yet.
    let pendingUpdate = false;

    const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
        try {
            for await (const x of it) {
                results[idx] = x;

                if (x) {
                    pendingUpdate = true;
                    nextDeferred.resolve();
                    nextDeferred = Defer();
                }
            }
        } catch (err: any) {
            // One failing page must neither abort its siblings nor surface
            // as an unhandled rejection of the detached Promise.all below.
            // NOTE(review): relies on CrawlerHost exposing `logger` — confirm.
            this.logger.warn(`Failed to scrap ${urls[idx]}`, { err: marshalErrorLike(err) });
        }
    };

    // Deliberately not awaited: completion is signalled through `concluded`.
    // Handlers never reject (see catch above), so this chain cannot reject.
    Promise.all(
        iterators.map((it, idx) => handler(it, idx))
    ).finally(() => {
        concluded = true;
        nextDeferred.resolve();
    });

    yield results;

    try {
        while (true) {
            if (!pendingUpdate) {
                if (concluded) {
                    break;
                }
                await nextDeferred.promise;
            }

            // Updates that land while the consumer holds this yield set the
            // flag again, so they are flushed on the next pass even if the
            // iterators conclude in the meantime.
            pendingUpdate = false;

            yield results;
        }
    } finally {
        // Stop the underlying scrapers when the consumer quits early.
        for (const x of iterators) {
            x.return();
        }
    }
}
|
||||
|
||||
}
|
||||
|
|
389
backend/functions/src/cloud-functions/searcher.ts
Normal file
389
backend/functions/src/cloud-functions/searcher.ts
Normal file
|
@ -0,0 +1,389 @@
|
|||
import {
|
||||
assignTransferProtocolMeta, marshalErrorLike,
|
||||
RPCHost, RPCReflection,
|
||||
AssertionFailureError,
|
||||
objHashMd5B64Of,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
import { RateLimitControl } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { ScrappingOptions } from '../services/puppeteer';
|
||||
import { Request, Response } from 'express';
|
||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||
import { BraveSearchService } from '../services/brave-search';
|
||||
import { CrawlerHost } from './crawler';
|
||||
import { CookieParam } from 'puppeteer';
|
||||
|
||||
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||
import { SearchResult } from '../db/searched';
|
||||
import { WebSearchApiResponse } from '../shared/3rd-party/brave-types';
|
||||
|
||||
|
||||
/**
 * RPC host for the web-search endpoint.
 *
 * The request path is treated as a search query. The query is sent to the
 * Brave search service (with a Firestore-backed cache in front of it), the
 * resulting URLs are scraped through `CrawlerHost`, and the formatted pages
 * are returned as plain text, JSON, or a server-sent event stream.
 */
@singleton()
export class SearcherHost extends RPCHost {
    logger = this.globalLogger.child({ service: this.constructor.name });

    // How long cached search responses are retained (sets `expireAt`).
    cacheRetentionMs = 1000 * 3600 * 24 * 7;
    // How long a cached search response is considered fresh.
    cacheValidMs = 1000 * 3600;

    constructor(
        protected globalLogger: Logger,
        protected rateLimitControl: RateLimitControl,
        protected threadLocal: AsyncContext,
        protected braveSearchService: BraveSearchService,
        protected crawler: CrawlerHost,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Handles a search request.
     *
     * An empty path returns the usage/index text (plus account info when the
     * caller is authenticated). Otherwise the path is decoded into a search
     * query, rate limits are applied (per uid or per IP), the query is
     * searched, and the result pages are scraped and returned in the format
     * negotiated through the `Accept` header.
     *
     * @throws InsufficientBalanceError when an authenticated user has no balance.
     * @throws AssertionFailureError when the search yields no results, or no
     *         content could be collected for the query.
     */
    @CloudHTTPv2({
        name: 'search2',
        runtime: {
            memory: '4GiB',
            timeoutSeconds: 300,
            concurrency: 4,
        },
        tags: ['Searcher'],
        httpMethod: ['get', 'post'],
        returnType: [String, OutputServerEventStream],
        exposeRoot: true,
    })
    @CloudHTTPv2({
        runtime: {
            memory: '8GiB',
            timeoutSeconds: 300,
            concurrency: 8,
            maxInstances: 200,
        },
        openapi: {
            operation: {
                parameters: {
                    'Accept': {
                        description: `Specifies your preference for the response format. \n\n` +
                            `Supported formats:\n` +
                            `- text/event-stream\n` +
                            `- application/json or text/json\n` +
                            `- text/plain`
                        ,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-No-Cache': {
                        description: `Ignores internal cache if this header is specified with a value.`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-Respond-With': {
                        description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
                            `Supported formats:\n` +
                            `- markdown\n` +
                            `- html\n` +
                            `- text\n` +
                            `- screenshot\n`
                        ,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-Proxy-Url': {
                        description: `Specifies your custom proxy if you prefer to use one. \n\n` +
                            `Supported protocols:\n` +
                            `- http\n` +
                            `- https\n` +
                            `- socks4\n` +
                            `- socks5\n\n` +
                            `For authentication, https://user:pass@host:port`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-Set-Cookie': {
                        description: `Sets cookie(s) to the headless browser for your request. \n\n` +
                            `Syntax is the same with standard Set-Cookie`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                    'X-With-Generated-Alt': {
                        description: `Enable automatic alt-text generating for images without an meaningful alt-text.`,
                        in: 'header',
                        schema: { type: 'string' }
                    },
                }
            }
        },
        tags: ['Searcher'],
        httpMethod: ['get', 'post'],
        returnType: [String, OutputServerEventStream],
        exposeRoot: true,
    })
    async search(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: {
            req: Request,
            res: Response,
        },
        auth: JinaEmbeddingsAuthDTO
    ) {
        const uid = await auth.solveUID();
        let chargeAmount = 0;
        // Use the path only (no query string) and undo percent-encoding, so
        // "/hello%20world" searches for "hello world".
        const searchQuery = this.decodeSearchQuery(ctx.req.path);
        if (!searchQuery) {
            const latestUser = uid ? await auth.assertUser() : undefined;
            const authMixin = latestUser ? `
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
[Balance left] ${latestUser.wallet.total_balance}
` : '';

            return assignTransferProtocolMeta(`${this.crawler.indexText}${authMixin}`,
                { contentType: 'text/plain', envelope: null }
            );
        }

        if (uid) {
            const user = await auth.assertUser();
            if (!(user.wallet.total_balance > 0)) {
                throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
            }

            await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
                [
                    // 1000 requests per minute
                    new Date(Date.now() - 60 * 1000), 1000
                ]
            );

            // Usage is reported once the RPC settles; `chargeAmount` is
            // assigned further down, after content has been collected.
            rpcReflect.finally(() => {
                if (chargeAmount) {
                    auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
                        this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                    });
                }
            });
        } else if (ctx.req.ip) {
            this.threadLocal.set('ip', ctx.req.ip);
            await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
                [
                    // 100 requests per minute
                    new Date(Date.now() - 60 * 1000), 100
                ]
            );
        }

        const customMode = ctx.req.get('x-respond-with') || 'default';
        const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
        const noCache = Boolean(ctx.req.get('x-no-cache'));

        // The header may arrive once (string) or several times (string[]).
        const setCookieHeaders = ctx.req.headers['x-set-cookie'];
        let rawCookies: string[] = [];
        if (Array.isArray(setCookieHeaders)) {
            rawCookies = setCookieHeaders;
        } else if (setCookieHeaders) {
            rawCookies = [setCookieHeaders];
        }
        const cookies: CookieParam[] = rawCookies.map(
            (raw) => parseSetCookieString(raw, { decodeValues: false }) as CookieParam
        );

        this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
        const crawlOpts: ScrappingOptions = {
            proxyUrl: ctx.req.get('x-proxy-url'),
            cookies,
            favorScreenshot: customMode === 'screenshot'
        };

        // X-No-Cache bypasses the search cache as well as the page cache.
        const r = await this.cachedWebSearch({
            q: searchQuery,
            count: 5
        }, noCache);

        // NOTE(review): assumes the search response can omit `web` when
        // nothing matched — confirm against the API types.
        const webResults = r.web?.results ?? [];
        if (!webResults.length) {
            throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
        }

        const urls = webResults.map((x) => new URL(x.url));
        const it = this.fetchSearchResults(customMode, urls, crawlOpts, noCache);

        const acceptsPlainText = Boolean(ctx.req.accepts('text/plain'));

        if (!acceptsPlainText && ctx.req.accepts('text/event-stream')) {
            const sseStream = new OutputServerEventStream();
            rpcReflect.return(sseStream);

            try {
                for await (const scrapped of it) {
                    if (!scrapped) {
                        continue;
                    }

                    chargeAmount = this.getChargeAmount(scrapped);
                    sseStream.write({
                        event: 'data',
                        data: scrapped,
                    });
                }
            } catch (err: any) {
                this.logger.error(`Failed to collect search result for query ${searchQuery}`,
                    { err: marshalErrorLike(err) }
                );
                sseStream.write({
                    event: 'error',
                    data: marshalErrorLike(err),
                });
            }

            sseStream.end();

            return sseStream;
        }

        const picked = await this.firstQualifiedOrLast(it, searchQuery);
        chargeAmount = this.getChargeAmount(picked);

        if (!acceptsPlainText && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
            return picked;
        }

        return assignTransferProtocolMeta(`${picked}`, { contentType: 'text/plain', envelope: null });
    }

    /**
     * Turns a request path into a search query: strips the leading slash and
     * percent-decodes the rest. A malformed escape sequence makes
     * `decodeURIComponent` throw; in that case the raw text is used as-is.
     */
    protected decodeSearchQuery(requestPath: string) {
        const raw = requestPath.slice(1);
        try {
            return decodeURIComponent(raw);
        } catch (_err) {
            return raw;
        }
    }

    /**
     * Consumes `it` until a batch passes `qualified()`, and returns it.
     * When the iterator ends without a qualified batch, the last batch seen
     * is returned instead. Leaving the loop early closes the iterator.
     *
     * @throws AssertionFailureError when the iterator produced nothing.
     */
    protected async firstQualifiedOrLast<T extends any[]>(it: AsyncIterable<T>, searchQuery: string): Promise<T> {
        let lastScrapped: T | undefined;
        for await (const scrapped of it) {
            lastScrapped = scrapped;

            if (this.qualified(scrapped)) {
                return scrapped;
            }
        }

        if (!lastScrapped) {
            throw new AssertionFailureError(`No content available for query ${searchQuery}`);
        }

        return lastScrapped;
    }

    /**
     * Scrapes `urls` via `crawler.scrapMany` and, for every progress update,
     * yields an array of formatted results index-aligned with `urls`.
     *
     * Missing snapshots are represented by placeholder objects carrying the
     * url and a "No content available" string form. Both each formatted
     * entry and the array itself get a `toString()` that numbers the entries
     * `[1]`, `[2]`, … for the plain-text response.
     */
    async *fetchSearchResults(mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
        urls: URL[], options?: ScrappingOptions, noCache = false) {

        for await (const scrapped of this.crawler.scrapMany(urls, options, noCache)) {
            const mapped = scrapped.map((x, i) => {
                if (!x) {
                    // toString lives on the prototype so the own-property
                    // check below leaves placeholders untouched.
                    const p = {
                        toString() {
                            return `[${i + 1}] No content available for ${urls[i]}`;
                        }
                    };
                    const r = Object.create(p);
                    r.url = urls[i].toString();

                    return r;
                }

                return this.crawler.formatSnapshot(mode, x, urls[i]);
            });

            const resultArray = await Promise.all(mapped);
            for (const [i, result] of resultArray.entries()) {
                if (result && typeof result === 'object' && Object.hasOwn(result, 'toString')) {
                    // Replace the formatter's own toString with a numbered one.
                    result.toString = function (this: any) {
                        if (mode === 'markdown') {
                            return `[${i + 1}]\n${this.content}`;
                        }

                        const mixins = [];
                        if (this.publishedTime) {
                            mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
                        }

                        return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
[${i + 1}] Markdown Content:
${this.content}
`;
                    };
                }
            }
            resultArray.toString = function () {
                return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${urls[i]}`).join('\n\n').trimEnd() + '\n';
            };

            yield resultArray;
        }
    }

    /**
     * Total charge for a batch: the sum of the crawler's per-entry charge,
     * counting entries without a charge as 0.
     */
    getChargeAmount(formatted: any[]) {
        return _.sum(
            formatted.map((x) => this.crawler.getChargeAmount(x) || 0)
        );
    }

    /**
     * A batch is qualified when every entry has a title and at least one
     * form of content (content, screenshot url, screenshot, text or html).
     */
    qualified(scrapped: any[]) {
        return _.every(scrapped, (x) =>
            (x as any)?.title &&
            (
                (x as any).content ||
                (x as any).screenShotUrl ||
                (x as any).screenshot ||
                (x as any).text ||
                (x as any).html
            )
        );
    }

    /**
     * Runs a web search with a Firestore cache in front of it.
     *
     * A cached response younger than `cacheValidMs` is returned directly.
     * Otherwise the search service is queried and the response is saved in
     * the background. If the search fails while an older cached response
     * exists, that stale response is returned instead of failing.
     *
     * @param query Search parameters; also hashed into the cache key.
     * @param noCache Skip the cache lookup (and therefore the stale fallback).
     */
    async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
        const queryDigest = objHashMd5B64Of(query);
        let cache;
        if (!noCache) {
            cache = (await SearchResult.fromFirestoreQuery(
                SearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
                    .orderBy('createdAt', 'desc')
                    .limit(1)
            ))[0];
            if (cache) {
                const age = Date.now() - cache.createdAt.valueOf();
                const stale = age > this.cacheValidMs;
                this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                    query, digest: queryDigest, age, stale
                });

                if (!stale) {
                    return cache.response as WebSearchApiResponse;
                }
            }
        }

        let r;
        try {
            r = await this.braveSearchService.webSearch(query);
        } catch (err: any) {
            if (cache) {
                this.logger.warn(`Failed to fetch search result, falling back to stale cache`, { err: marshalErrorLike(err) });

                return cache.response as WebSearchApiResponse;
            }

            throw err;
        }

        const nowDate = new Date();
        const record = SearchResult.from({
            query,
            queryDigest,
            response: r,
            createdAt: nowDate,
            expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
        });
        // Best-effort: a failed cache write must not fail the search.
        SearchResult.save(record).catch((err) => {
            this.logger.warn(`Failed to cache search result`, { err: marshalErrorLike(err) });
        });

        return r;
    }
}
|
60
backend/functions/src/db/searched.ts
Normal file
60
backend/functions/src/db/searched.ts
Normal file
|
@ -0,0 +1,60 @@
|
|||
import { Also, parseJSONText, Prop } from 'civkit';
|
||||
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||
import _ from 'lodash';
|
||||
|
||||
/**
 * Firestore record holding one cached web-search response.
 *
 * `query` and `response` are stored as JSON strings (see
 * `degradeForFireStore`) and parsed back into objects by `from`.
 */
@Also({
    dictOf: Object
})
export class SearchResult extends FirestoreRecord {
    static override collectionName = 'searchResults';

    override _id!: string;

    @Prop({
        required: true
    })
    query!: any;

    @Prop({
        required: true
    })
    queryDigest!: string;

    @Prop()
    response?: any;

    @Prop()
    createdAt!: Date;

    @Prop()
    expireAt?: Date;

    [k: string]: any;

    // Fields serialized to JSON text when written and parsed when read.
    static patchedFields = [
        'query',
        'response',
    ];

    /**
     * Builds a record from raw input, parsing any patched field that is
     * still a JSON string. The caller's object is never modified: a shallow
     * copy is made the first time a field needs to be replaced.
     */
    static override from(input: any) {
        let normalized = input;
        for (const field of this.patchedFields) {
            if (typeof input[field] === 'string') {
                if (normalized === input) {
                    normalized = { ...input };
                }
                normalized[field] = parseJSONText(input[field]);
            }
        }

        return super.from(normalized) as SearchResult;
    }

    /**
     * Returns a shallow copy of this record in which every patched field
     * whose `typeof` is 'object' is replaced by its JSON text.
     */
    override degradeForFireStore() {
        const copy: any = { ...this };

        for (const field of (this.constructor as typeof SearchResult).patchedFields) {
            if (typeof copy[field] === 'object') {
                copy[field] = JSON.stringify(copy[field]) as any;
            }
        }

        return copy;
    }
}
|
71
backend/functions/src/services/brave-search.ts
Normal file
71
backend/functions/src/services/brave-search.ts
Normal file
|
@ -0,0 +1,71 @@
|
|||
import { AsyncService, DownstreamServiceFailureError } from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { SecretExposer } from '../shared/services/secrets';
|
||||
import { BraveSearchHTTP, WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
||||
import { AsyncContext } from '../shared';
|
||||
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
||||
|
||||
/**
 * Thin service around the Brave search HTTP client. It adds optional
 * location and user-agent headers derived from the current async context
 * before issuing a web search.
 */
@singleton()
export class BraveSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
        protected geoipControl: GeoIPService,
        protected threadLocal: AsyncContext,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        // Construct the client before announcing readiness so that nothing
        // reacting to 'ready' can observe an unset `braveSearchHTTP`.
        this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);

        this.emit('ready');
    }

    /**
     * Performs a web search and returns the parsed response.
     *
     * When the async context carries an `ip`, it is resolved through GeoIP
     * into `X-Loc-*` headers; a `userAgent` in the context is forwarded as
     * `User-Agent`. Headers are only set for values that are present, and a
     * failing GeoIP lookup is logged and skipped rather than failing the
     * search.
     *
     * @throws DownstreamServiceFailureError when the search request fails.
     */
    async webSearch(query: WebSearchQueryParams) {
        const ip = this.threadLocal.get('ip');
        const extraHeaders: WebSearchOptionalHeaderOptions = {};
        if (ip) {
            let geoip;
            try {
                geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);
            } catch (err) {
                // Location hints are optional; search without them.
                this.logger.warn(`GeoIP lookup failed, searching without location hints`, { err });
            }

            if (geoip?.city) {
                extraHeaders['X-Loc-City'] = geoip.city;
            }
            if (geoip?.country?.code) {
                extraHeaders['X-Loc-Country'] = geoip.country.code;
            }
            if (geoip?.timezone) {
                extraHeaders['X-Loc-Timezone'] = geoip.timezone;
            }
            if (geoip?.coordinates) {
                extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
                extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
            }

            // Only the first subdivision is reported.
            const primarySubdivision = geoip?.subdivisions?.[0];
            if (primarySubdivision?.code) {
                extraHeaders['X-Loc-State'] = primarySubdivision.code;
            }
            if (primarySubdivision?.name) {
                extraHeaders['X-Loc-State-Name'] = primarySubdivision.name;
            }
        }

        const userAgent = this.threadLocal.get('userAgent');
        if (userAgent) {
            extraHeaders['User-Agent'] = userAgent;
        }

        try {
            const r = await this.braveSearchHTTP.webSearch(query, { headers: extraHeaders as Record<string, string> });

            return r.parsed;
        } catch (err) {
            throw new DownstreamServiceFailureError({ message: `Search failed`, cause: err });
        }
    }

}
|
123
backend/functions/src/services/geoip.ts
Normal file
123
backend/functions/src/services/geoip.ts
Normal file
|
@ -0,0 +1,123 @@
|
|||
import { container, singleton } from 'tsyringe';
|
||||
import fsp from 'fs/promises';
|
||||
import { CityResponse, Reader } from 'maxmind';
|
||||
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
|
||||
import { Logger } from '../shared';
|
||||
import path from 'path';
|
||||
|
||||
/**
 * Language keys accepted by `GeoIPService.lookupCity` when picking a
 * localized name out of a database record's `names` map.
 */
export enum GEOIP_SUPPORTED_LANGUAGES {
    EN = 'en',
    ZH_CN = 'zh-CN',
    JA = 'ja',
    DE = 'de',
    FR = 'fr',
    ES = 'es',
    PT_BR = 'pt-BR',
    RU = 'ru',
}
|
||||
|
||||
/**
 * A coded, named place (used for continents and subdivisions, and as the
 * base of `GeoIPCountryInfo`).
 */
export class GeoIPInfo extends AutoCastable {
    // Short code of the place, when the database provides one.
    @Prop()
    code?: string;

    // Display name of the place in the requested language.
    @Prop()
    name?: string;
}
|
||||
|
||||
/**
 * Country entry of a GeoIP lookup: code and name plus an EU-membership flag.
 */
export class GeoIPCountryInfo extends GeoIPInfo {
    // Filled from the record's `is_in_european_union` field by `lookupCity`.
    @Prop()
    eu?: boolean;
}
|
||||
|
||||
/**
 * Normalized result of `GeoIPService.lookupCity`. Every field is optional
 * because the database may not know every detail for a given address.
 */
export class GeoIPCityResponse extends AutoCastable {
    @Prop()
    continent?: GeoIPInfo;

    @Prop()
    country?: GeoIPCountryInfo;

    @Prop({ arrayOf: GeoIPInfo })
    subdivisions?: GeoIPInfo[];

    // Localized city name.
    @Prop()
    city?: string;

    // [latitude, longitude, accuracy radius], in that order.
    @Prop({ arrayOf: Number })
    coordinates?: [number, number, number];

    @Prop()
    timezone?: string;
}
|
||||
|
||||
/**
 * Looks up city-level location data for IP addresses from a local
 * GeoLite2-City database file, which is loaded on first use.
 */
@singleton()
export class GeoIPService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    // Set by `_lazyload()`; unset until the first lookup.
    mmdbCity!: Reader<CityResponse>;

    constructor(
        protected globalLogger: Logger,
    ) {
        super(...arguments);
    }


    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Reads `licensed/GeoLite2-City.mmdb` (two directories above this
     * module's directory) into memory and creates the reader.
     */
    @runOnce()
    async _lazyload() {
        const mmdbPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb');

        const dbBuff = await fsp.readFile(mmdbPath, { flag: 'r', encoding: null });

        this.mmdbCity = new Reader<CityResponse>(dbBuff);

        this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`);
    }


    /**
     * Resolves an IP address to location details.
     *
     * @param ip Address to look up.
     * @param lang Preferred language for names; English is the fallback
     *        whenever the preferred language has no entry.
     * @returns The normalized record, or `undefined` when the database has
     *          no entry for the address.
     */
    async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
        await this._lazyload();

        const r = this.mmdbCity.get(ip);

        if (!r) {
            return undefined;
        }

        return GeoIPCityResponse.from({
            continent: r.continent ? {
                code: r.continent.code,
                name: r.continent.names?.[lang] || r.continent.names?.en,
            } : undefined,
            country: r.country ? {
                code: r.country.iso_code,
                // `names` is guarded on both sides, like every other lookup here.
                name: r.country.names?.[lang] || r.country.names?.en,
                eu: r.country.is_in_european_union,
            } : undefined,
            city: r.city?.names?.[lang] || r.city?.names?.en,
            subdivisions: r.subdivisions?.map((x) => ({
                code: x.iso_code,
                name: x.names?.[lang] || x.names?.en,
            })),
            coordinates: r.location ? [
                r.location.latitude, r.location.longitude, r.location.accuracy_radius
            ] : undefined,
            timezone: r.location?.time_zone,
        });
    }

}
|
||||
|
||||
// Default export: the GeoIPService instance resolved from the tsyringe container.
export default container.resolve(GeoIPService);
|
|
@ -278,7 +278,7 @@ document.addEventListener('load', handlePageLoad);
|
|||
return page;
|
||||
}
|
||||
|
||||
async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
||||
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
||||
// parsedUrl.search = '';
|
||||
const url = parsedUrl.toString();
|
||||
|
||||
|
@ -287,10 +287,10 @@ document.addEventListener('load', handlePageLoad);
|
|||
let screenshot: Buffer | undefined;
|
||||
|
||||
const page = await this.pagePool.acquire();
|
||||
if (options.proxyUrl) {
|
||||
if (options?.proxyUrl) {
|
||||
await page.useProxy(options.proxyUrl);
|
||||
}
|
||||
if (options.cookies) {
|
||||
if (options?.cookies) {
|
||||
await page.setCookie(...options.cookies);
|
||||
}
|
||||
|
||||
|
@ -353,7 +353,7 @@ document.addEventListener('load', handlePageLoad);
|
|||
yield { ...snapshot, screenshot } as PageSnapshot;
|
||||
break;
|
||||
}
|
||||
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||
screenshot = await page.screenshot();
|
||||
lastHTML = snapshot.html;
|
||||
}
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 584791b789cd483dab18735416744b4d10130993
|
||||
Subproject commit 2f2cdcff7b2738be33ee5aca858ef2d65eba29ed
|
Loading…
Reference in New Issue
Block a user