Commit Graph

226 Commits

Author SHA1 Message Date
Harsh Gupta
d380599986 strip more stuff 2024-08-14 13:47:25 +05:30
Harsh Gupta (aider)
bf27d39f1b fix: Replace estimateToken with a mock implementation 2024-08-14 13:45:02 +05:30
Harsh Gupta
aeb6ebed67 fix: Remove unnecessary SecurityCompromiseError import and usage
diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index ec00c1d..bb8ba1a 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -3,10 +3,10 @@ import {
     RPCHost, RPCReflection,
     HashManager,
     AssertionFailureError, ParamValidationError, Defer,
-    SecurityCompromiseError
+
 } from 'civkit';
 import { singleton } from 'tsyringe';
-import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
+import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
 import _ from 'lodash';
 import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 import { Request, Response } from 'express';
@@ -660,7 +660,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         if (!uid) {
             if (urlToCrawl.protocol === 'http:' && (!urlToCrawl.pathname || urlToCrawl.pathname === '/') &&
                 crawlerOptions.respondWith !== 'default') {
-                throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
+                throw new Error(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
             }
             const blockade = (await DomainBlockade.fromFirestoreQuery(
                 DomainBlockade.COLLECTION
@@ -669,7 +669,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
                     .limit(1)
             ))[0];
             if (blockade) {
-                throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
+                throw new Error(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
             }

         }
@@ -940,7 +940,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;

             yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
         } catch (err: any) {
-            if (cache && !(err instanceof SecurityCompromiseError)) {
+            if (cache && !(err instanceof Error)) {
                 this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
                 yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
                 return;
2024-08-14 13:45:01 +05:30
Harsh Gupta (aider)
d15b721bfa refactor: Remove rate limiting from crawler.ts 2024-08-14 13:43:04 +05:30
Harsh Gupta
aa862d4247 fix: Refactor crawler.ts by removing unused imports and code 2024-08-14 13:43:03 +05:30
Yanlong Wang
df58fcb3fa
fix: alleviate search performance issue 2024-08-09 15:03:24 +08:00
Yanlong Wang
eb74e9c6f8
fix: remove select element from markdown to walk around turndown performance issue 2024-08-09 10:55:36 +08:00
Yanlong Wang
e4ef6cb0f9
chore: reduce fetch count in search 2024-08-09 10:29:50 +08:00
Yanlong Wang
e529369ba6
fix: search with failed pages 2024-08-08 15:49:23 +08:00
Yanlong Wang
0dd05b5dab
chore: tweak concurrency 2024-08-06 17:58:27 +08:00
Yanlong Wang
7af2bde01f
fix: html rebasing with <base> tag 2024-08-06 13:15:10 +08:00
Yanlong Wang
40e91853e2 fix 2024-08-02 20:10:17 +08:00
Yanlong Wang
cda0f371e1
feat: updated rate policy 2024-08-02 19:39:51 +08:00
Yanlong Wang
0a2c0932fd
fix 2024-08-02 17:13:50 +08:00
Yanlong Wang
ee632199df
fix 2024-08-02 17:12:10 +08:00
Yanlong Wang
0a33207f8f
fix: another approach to suspected DoS abuse 2024-08-02 17:04:13 +08:00
yanlong.wang
e658e8102c
fix 2024-08-01 20:07:39 +08:00
yanlong.wang
f4f189c8e6
fix 2024-08-01 19:51:53 +08:00
yanlong.wang
54fa5feb7f
fix 2024-08-01 19:49:40 +08:00
yanlong.wang
d0a922144d
fix 2024-08-01 19:48:00 +08:00
yanlong.wang
6fb5df97cc
fix: abuse of flooding elements 2024-08-01 19:34:39 +08:00
yanlong.wang
8b7af6d076
fix: ignore match all target selectors for performance 2024-07-31 14:06:22 +08:00
yanlong.wang
a08218506e
fix: truncate svg in jsdom 2024-07-31 13:12:57 +08:00
Yanlong Wang
4e5aff3332
debug: log jsdom and turndown operations 2024-07-31 11:12:12 +08:00
Yanlong Wang
0f239793d2
fix: also recover screenshot pricing 2024-07-30 20:11:55 +08:00
Yanlong Wang
d3f3a8502a
fix: revert screenshot behavior and introduce pageshot 2024-07-30 20:09:06 +08:00
yanlong.wang
57cbae864e
fix: jsdom, cache tolerance, screenshot pricing 2024-07-30 17:47:26 +08:00
yanlong.wang
77c8480ca6
feat: with-iframe and full-page screenshot 2024-07-30 15:08:09 +08:00
Yanlong Wang
e4d46e7acb
fix: count parameter 2024-07-25 19:46:28 +08:00
Yanlong Wang
1c4b64fe04
feat: bring your own html 2024-07-25 16:54:28 +08:00
Yanlong Wang
78ea13b101
fix 2024-07-25 15:14:36 +08:00
Yanlong Wang
9bcde30f11
fix 2024-07-25 14:56:19 +08:00
Yanlong Wang
09dbbd3b0f
fix: retry for brave search 2024-07-25 08:56:02 +08:00
Yanlong Wang
ec7c2ab52c fix: scrap timing 2024-07-24 22:59:00 +08:00
yanlong.wang
873994397a
fix: allow POST with url 2024-07-24 15:08:32 +08:00
yanlong.wang
786b1828b7
fix: brave search operators in headers 2024-07-23 15:50:28 +08:00
yanlong.wang
efe7a61e3b
fix: timeout parameter 2024-07-18 12:33:19 +08:00
yanlong.wang
61ff011c13
fix: stop early return when timeout is explicitly defined 2024-07-18 11:41:58 +08:00
yanlong.wang
d0e2920163
feat: expose brave search operators explicitly 2024-07-15 18:11:58 +08:00
Yanlong Wang
336931b5e8
chore: tweak deployment 2024-07-12 15:10:01 +08:00
yanlong.wang
c6634298c4
fix: provide empty content field 2024-07-09 18:14:19 +08:00
Yanlong Wang
f6a183f87a
fix 2024-07-06 01:00:37 +08:00
Yanlong Wang
c69ec77c60
fix 2024-07-06 00:59:37 +08:00
Yanlong Wang
c9ef602a94
fix: potential firebase blockade 2024-07-06 00:25:48 +08:00
yanlong.wang
1bcb5a742e
fix: dos abuse 2024-07-01 18:40:14 +08:00
Yanlong Wang
0a09aeb967
fix 2024-07-01 11:06:30 +08:00
Yanlong Wang
8951578ef1 chore 2024-07-01 00:29:12 +08:00
Yanlong Wang
858e52fe2e
chore: tweak cpu 2024-06-30 22:19:03 +08:00
yanlong.wang
62fb6cff94
feat: keepImgDataUrl 2024-06-24 15:18:53 +08:00
yanlong.wang
1084b16c84
fix: typo 2024-06-20 18:31:56 +08:00
yanlong.wang
579f259cb9
fix: detect when readability does not work 2024-06-20 18:20:13 +08:00
yanlong.wang
eaa06781e3
fix: normalize-url pollution 2024-06-20 14:53:25 +08:00
yanlong.wang
6f37e5d3b4
feat: x-remove-selector 2024-06-18 18:07:38 +08:00
yanlong.wang
ee008ebe10
fix: improved code rules 2024-06-13 16:27:30 +08:00
yanlong.wang
fd9a86bc00
chore: fix abuse timing 2024-06-11 13:57:19 +08:00
Yanlong Wang
70d80bbcfe
fix: abuse condition 2024-06-10 17:41:38 +08:00
Yanlong Wang
5789ae1407
chore: dont abuse our service 2024-06-10 17:23:50 +08:00
yanlong.wang
1e3bae6aad
fix: timeout parsing 2024-06-05 19:50:48 +08:00
yanlong.wang
a9936d322e
fix: search descriptions 2024-06-05 19:47:04 +08:00
yanlong.wang
165cce6c91
refactor: options dto 2024-06-05 18:55:40 +08:00
Yanlong Wang
f0668a96b4
fix: potential circular crawling 2024-06-02 23:23:39 +08:00
Yanlong Wang
be91371b93
fix: ignore blockade for authenticated users 2024-06-02 09:09:21 +08:00
Yanlong Wang
154d8ede45
fix: truncate svg 2024-06-02 08:57:39 +08:00
Yanlong Wang
7a7e49bc00
fix: blockade query 2024-06-01 08:06:46 +08:00
Yanlong Wang
d2bebec60f
fix: abuse blocker 2024-06-01 02:01:12 +08:00
Yanlong Wang
249408df6b
fix 2024-06-01 01:07:50 +08:00
Yanlong Wang
43dee08dcc
security: detect abuse 2024-06-01 00:57:51 +08:00
Yanlong Wang
908157b61e
fix: pdf cache 2024-05-31 19:05:17 +08:00
Yanlong Wang
9c60b4b93d
fix: setup expire for pdf caches 2024-05-31 18:36:23 +08:00
Yanlong Wang
1ba21da0c5
fix: pdf cache 2024-05-31 18:26:05 +08:00
Yanlong Wang
fd0b77285f
fix: firebase fail to save large docs 2024-05-31 18:16:37 +08:00
Yanlong Wang
964b66b6ab
fix: data crunching import 2024-05-31 17:32:16 +08:00
Yanlong Wang
9ac40606d5
fix: bulk fix multiple issues 2024-05-31 17:30:57 +08:00
Yanlong Wang
0c15946874
fix: trimstart url 2024-05-30 20:29:31 +08:00
Yanlong Wang
33e14e5404
feat: extract text from pdf (#70)
* feat: pdf

* fix

* fix
2024-05-30 20:21:33 +08:00
yanlong.wang
7c5712363c
feat: allow custom rate limit per uid 2024-05-23 15:36:09 +08:00
yanlong.wang
8eee95119d
feat: index brief in JSON format 2024-05-23 12:06:07 +08:00
yanlong.wang
4f37de24f6
fix: docs 2024-05-21 17:35:16 +08:00
Yanlong Wang
a8e0628460
feat: links and images summary (#63)
* wip: dedicated link and image summary

* fix

* fix

* fix

* fix: docs

* fix

* fix

* fix
2024-05-21 17:34:19 +08:00
Yanlong Wang
df71c9a534
fix: stop using pool 2024-05-20 01:12:22 +08:00
Yanlong Wang
4077fa7040
fix: geoip encoding 2024-05-17 09:31:22 +08:00
Yanlong Wang
2941be6096
fix: potential unencoded query 2024-05-17 09:15:37 +08:00
Yanlong Wang
ed9e9f43cf
fix: block rough requests 2024-05-16 20:22:26 +08:00
yanlong.wang
8ec8c1e718
fix: logging for search error 2024-05-16 19:01:30 +08:00
yanlong.wang
e0e37ad4d7
fix: potential chargeAmount mismatch 2024-05-16 18:43:41 +08:00
yanlong.wang
8b0916f858
fix: race condition while logging chargeAmount 2024-05-16 18:26:18 +08:00
yanlong.wang
6f4819bc49
chore: tweak deployment 2024-05-16 17:46:53 +08:00
yanlong.wang
322cb86f21
fix: on no results 2024-05-16 17:30:47 +08:00
yanlong.wang
e2698b48bd
fix: rate limit tag for search 2024-05-16 16:58:10 +08:00
yanlong.wang
72e1c46a6c
fix: improve search responsiveness 2024-05-16 15:47:49 +08:00
Yanlong Wang
0583645613
fix: noCache in search 2024-05-16 00:42:30 +08:00
Yanlong Wang
4556954d17
fix: image url 2024-05-16 00:39:24 +08:00
Yanlong Wang
6f65083f8d
feat: control cache tolerance and select target using headers 2024-05-16 00:10:20 +08:00
yanlong.wang
77fc500f41
fix: allow x-return-format header alias 2024-05-15 12:24:46 +08:00
Yanlong Wang
445624c405
fix: early return for search 2024-05-15 08:47:16 +08:00
Yanlong Wang
1cf8e83857
fix: add cache tolerance 2024-05-15 08:06:35 +08:00
Yanlong Wang
d100c3fc5f
fix: search result cache save 2024-05-14 19:57:49 +08:00
Yanlong Wang
ec4ce4fef3
chore: update rate limits 2024-05-14 19:44:35 +08:00
Yanlong Wang
2e3c217479
feat: web search (#57) 2024-05-14 19:39:43 +08:00
Yanlong Wang
f171e54ac9
fix: log charge amount 2024-05-14 17:25:59 +08:00