This commit is contained in:
Yanlong Wang 2024-04-12 10:59:37 +08:00
parent 629ab270be
commit 78c8444096
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 18 additions and 17 deletions

View File

@@ -18,7 +18,8 @@
"from-preset": "npm run build && npm run emu:reset && npm run emu:start",
"start": "npm run shell",
"deploy": "firebase deploy --only functions",
"logs": "firebase functions:log"
"logs": "firebase functions:log",
"gcp-build": "npx puppeteer browsers install chrome"
},
"engines": {
"node": "18"

View File

@@ -36,16 +36,16 @@ export class CrawlerHost extends RPCHost {
const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
urlSource: snapshot.href.trim(),
markdownContent: contentText.trim(),
url: snapshot.href.trim(),
content: contentText.trim(),
toString() {
return `Title: ${this.title}
URL Source: ${this.urlSource}
URL Source: ${this.url}
Markdown Content:
${contentText}
${this.content}
`;
}
};

View File

@@ -145,7 +145,7 @@ function giveSnapshot() {
async *scrap(url: string, noCache: string | boolean = false) {
const parsedUrl = new URL(url);
parsedUrl.search = '';
// parsedUrl.search = '';
parsedUrl.hash = '';
const normalizedUrl = parsedUrl.toString().toLowerCase();
const digest = md5Hasher.hash(normalizedUrl);
@@ -191,7 +191,17 @@ function giveSnapshot() {
page.on('snapshot', hdl);
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
.then(async (r) => {
.catch((err) => {
this.logger.warn(`Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
return Promise.reject(new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`,
cause: err,
}));
}).finally(async () => {
finalized = true;
if (!snapshot?.html) {
return;
}
screenshot = await page.screenshot({
type: 'jpeg',
quality: 85,
@@ -210,16 +220,6 @@ function giveSnapshot() {
).catch((err) => {
this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
});
return r;
}).catch((err) => {
this.logger.warn(`Failed to goto ${url}`, { err: marshalErrorLike(err) });
return Promise.reject(new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`,
cause: err,
}));
}).finally(() => {
finalized = true;
});
try {