Harsh Gupta (aider)
|
a72373f815
|
fix: Add try-catch block to handle errors in salvage method
|
2024-08-14 16:04:57 +05:30 |
|
Harsh Gupta
|
888546e064
|
fix: Make the salvage method private in the puppeteer service
|
2024-08-14 16:04:56 +05:30 |
|
Harsh Gupta (aider)
|
ef138360c2
|
fix: Remove private modifier from salvage method
|
2024-08-14 16:01:00 +05:30 |
|
Harsh Gupta (aider)
|
f6f3fc5bea
|
fix: Improve error handling and add retry mechanism in PuppeteerControl
|
2024-08-14 15:49:13 +05:30 |
|
Harsh Gupta (aider)
|
a3a299fb38
|
fix: Implement retry mechanism and improve error handling for scraping function
|
2024-08-14 15:46:41 +05:30 |
|
Harsh Gupta
|
ddbf0030b4
|
fix the logger thingy
|
2024-08-14 15:35:20 +05:30 |
|
Harsh Gupta (aider)
|
a3f222638e
|
feat: Add shared module dependencies and exports
|
2024-08-14 15:15:07 +05:30 |
|
Harsh Gupta (aider)
|
02abc2aaaa
|
fix: Register Logger class with dependency injection container
|
2024-08-14 15:11:14 +05:30 |
|
Harsh Gupta
|
2d6447e8fc
|
add mock shared libraries
|
2024-08-14 14:53:52 +05:30 |
|
Harsh Gupta
|
88a6bd7131
|
remove submodule shared
|
2024-08-14 14:53:22 +05:30 |
|
Harsh Gupta
|
cbe4fa94c1
|
remove alt-text service
|
2024-08-14 14:46:37 +05:30 |
|
Harsh Gupta
|
4c957adbce
|
remove PDF extraction functionality
|
2024-08-14 14:44:01 +05:30 |
|
Harsh Gupta
|
db6cd7d76c
|
fixes
|
2024-08-14 14:41:02 +05:30 |
|
Harsh Gupta (aider)
|
e9ac98a628
|
fix: Remove new keyword when using RPCReflection
|
2024-08-14 14:39:32 +05:30 |
|
Harsh Gupta
|
4e9b6b7ca5
|
fix: Update type annotations for mixins and suffixMixins arrays in crawler.ts
|
2024-08-14 14:39:31 +05:30 |
|
Harsh Gupta
|
87d9f772c1
|
more fixes
|
2024-08-14 14:35:07 +05:30 |
|
Harsh Gupta (aider)
|
2343c1d28b
|
feat: Modify crawler.ts and index.ts to make crawl function usable as a Firebase function
|
2024-08-14 14:30:08 +05:30 |
|
Harsh Gupta
|
80547abf38
|
fix: Remove unused code and dependencies
|
2024-08-14 14:30:07 +05:30 |
|
Harsh Gupta (aider)
|
c33929afb2
|
refactor: remove usage of cache
|
2024-08-14 13:51:35 +05:30 |
|
Harsh Gupta
|
127c32abc9
|
fix: Remove unnecessary code for calculating charge amount
|
2024-08-14 13:51:33 +05:30 |
|
Harsh Gupta (aider)
|
6804b99533
|
fix: Remove billing and related flow
|
2024-08-14 13:49:46 +05:30 |
|
Harsh Gupta
|
6c17175c43
|
fix: Remove unused getChargeAmount function
|
2024-08-14 13:49:44 +05:30 |
|
Harsh Gupta (aider)
|
fbdc266660
|
fix: Remove auth and user info
|
2024-08-14 13:48:29 +05:30 |
|
Harsh Gupta
|
d380599986
|
strip more stuff
|
2024-08-14 13:47:25 +05:30 |
|
Harsh Gupta (aider)
|
bf27d39f1b
|
fix: Replace estimateToken with a mock implementation
|
2024-08-14 13:45:02 +05:30 |
|
Harsh Gupta
|
aeb6ebed67
|
fix: Remove unnecessary SecurityCompromiseError import and usage
diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index ec00c1d..bb8ba1a 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -3,10 +3,10 @@ import {
RPCHost, RPCReflection,
HashManager,
AssertionFailureError, ParamValidationError, Defer,
- SecurityCompromiseError
+
} from 'civkit';
import { singleton } from 'tsyringe';
-import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
+import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
@@ -660,7 +660,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if (!uid) {
if (urlToCrawl.protocol === 'http:' && (!urlToCrawl.pathname || urlToCrawl.pathname === '/') &&
crawlerOptions.respondWith !== 'default') {
- throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
+ throw new Error(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
}
const blockade = (await DomainBlockade.fromFirestoreQuery(
DomainBlockade.COLLECTION
@@ -669,7 +669,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
.limit(1)
))[0];
if (blockade) {
- throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
+ throw new Error(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
}
@@ -940,7 +940,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
} catch (err: any) {
- if (cache && !(err instanceof SecurityCompromiseError)) {
+ if (cache && !(err instanceof Error)) {
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
return;
|
2024-08-14 13:45:01 +05:30 |
|
Harsh Gupta (aider)
|
d15b721bfa
|
refactor: Remove rate limiting from crawler.ts
|
2024-08-14 13:43:04 +05:30 |
|
Harsh Gupta
|
aa862d4247
|
fix: Refactor crawler.ts by removing unused imports and code
|
2024-08-14 13:43:03 +05:30 |
|
Yanlong Wang
|
df58fcb3fa
|
fix: alleviate search performance issue
|
2024-08-09 15:03:24 +08:00 |
|
Yanlong Wang
|
eb74e9c6f8
|
fix: remove select element from markdown to work around turndown performance issue
|
2024-08-09 10:55:36 +08:00 |
|
Yanlong Wang
|
e4ef6cb0f9
|
chore: reduce fetch count in search
|
2024-08-09 10:29:50 +08:00 |
|
Yanlong Wang
|
e529369ba6
|
fix: search with failed pages
|
2024-08-08 15:49:23 +08:00 |
|
Yanlong Wang
|
0dd05b5dab
|
chore: tweak concurrency
|
2024-08-06 17:58:27 +08:00 |
|
Yanlong Wang
|
7af2bde01f
|
fix: html rebasing with <base> tag
|
2024-08-06 13:15:10 +08:00 |
|
Yanlong Wang
|
40e91853e2
|
fix
|
2024-08-02 20:10:17 +08:00 |
|
Yanlong Wang
|
cda0f371e1
|
feat: updated rate policy
|
2024-08-02 19:39:51 +08:00 |
|
Yanlong Wang
|
0a2c0932fd
|
fix
|
2024-08-02 17:13:50 +08:00 |
|
Yanlong Wang
|
ee632199df
|
fix
|
2024-08-02 17:12:10 +08:00 |
|
Yanlong Wang
|
0a33207f8f
|
fix: another approach to suspected DoS abuse
|
2024-08-02 17:04:13 +08:00 |
|
yanlong.wang
|
e658e8102c
|
fix
|
2024-08-01 20:07:39 +08:00 |
|
yanlong.wang
|
f4f189c8e6
|
fix
|
2024-08-01 19:51:53 +08:00 |
|
yanlong.wang
|
54fa5feb7f
|
fix
|
2024-08-01 19:49:40 +08:00 |
|
yanlong.wang
|
d0a922144d
|
fix
|
2024-08-01 19:48:00 +08:00 |
|
yanlong.wang
|
6fb5df97cc
|
fix: abuse of flooding elements
|
2024-08-01 19:34:39 +08:00 |
|
yanlong.wang
|
8b7af6d076
|
fix: ignore match all target selectors for performance
|
2024-07-31 14:06:22 +08:00 |
|
yanlong.wang
|
a08218506e
|
fix: truncate svg in jsdom
|
2024-07-31 13:12:57 +08:00 |
|
Yanlong Wang
|
4e5aff3332
|
debug: log jsdom and turndown operations
|
2024-07-31 11:12:12 +08:00 |
|
Yanlong Wang
|
0f239793d2
|
fix: also recover screenshot pricing
|
2024-07-30 20:11:55 +08:00 |
|
Yanlong Wang
|
d3f3a8502a
|
fix: revert screenshot behavior and introduce pageshot
|
2024-07-30 20:09:06 +08:00 |
|
yanlong.wang
|
57cbae864e
|
fix: jsdom, cache tolerance, screenshot pricing
|
2024-07-30 17:47:26 +08:00 |
|