diff --git a/backend/functions/package.json b/backend/functions/package.json index 8f81e54..c6b8b85 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -18,7 +18,8 @@ "from-preset": "npm run build && npm run emu:reset && npm run emu:start", "start": "npm run shell", "deploy": "firebase deploy --only functions", - "logs": "firebase functions:log" + "logs": "firebase functions:log", + "gcp-build": "npx puppeteer browsers install chrome" }, "engines": { "node": "18" diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 320251f..56670c3 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -36,16 +36,16 @@ export class CrawlerHost extends RPCHost { const formatted = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), - urlSource: snapshot.href.trim(), - markdownContent: contentText.trim(), + url: snapshot.href.trim(), + content: contentText.trim(), toString() { return `Title: ${this.title} -URL Source: ${this.urlSource} +URL Source: ${this.url} Markdown Content: -${contentText} +${this.content} `; } }; diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 7205d8c..ba70695 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -145,7 +145,7 @@ function giveSnapshot() { async *scrap(url: string, noCache: string | boolean = false) { const parsedUrl = new URL(url); - parsedUrl.search = ''; + // parsedUrl.search = ''; parsedUrl.hash = ''; const normalizedUrl = parsedUrl.toString().toLowerCase(); const digest = md5Hasher.hash(normalizedUrl); @@ -191,7 +191,17 @@ function giveSnapshot() { page.on('snapshot', hdl); const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 }) - .then(async (r) => { + .catch((err) => { + this.logger.warn(`Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) }); + return Promise.reject(new AssertionFailureError({ + message: `Failed to goto ${url}: ${err}`, + cause: err, + })); + }).finally(async () => { + finalized = true; + if (!snapshot?.html) { + return; + } screenshot = await page.screenshot({ type: 'jpeg', quality: 85, @@ -210,16 +220,6 @@ function giveSnapshot() { ).catch((err) => { this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) }); }); - - return r; - }).catch((err) => { - this.logger.warn(`Failed to goto ${url}`, { err: marshalErrorLike(err) }); - return Promise.reject(new AssertionFailureError({ - message: `Failed to goto ${url}: ${err}`, - cause: err, - })); - }).finally(() => { - finalized = true; }); try {