Harsh Gupta
|
d380599986
|
strip more stuff
|
2024-08-14 13:47:25 +05:30 |
|
Harsh Gupta (aider)
|
bf27d39f1b
|
fix: Replace estimateToken with a mock implementation
|
2024-08-14 13:45:02 +05:30 |
|
Harsh Gupta
|
aeb6ebed67
|
fix: Remove unnecessary SecurityCompromiseError import and usage
diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index ec00c1d..bb8ba1a 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -3,10 +3,10 @@ import {
RPCHost, RPCReflection,
HashManager,
AssertionFailureError, ParamValidationError, Defer,
- SecurityCompromiseError
+
} from 'civkit';
import { singleton } from 'tsyringe';
-import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
+import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
@@ -660,7 +660,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if (!uid) {
if (urlToCrawl.protocol === 'http:' && (!urlToCrawl.pathname || urlToCrawl.pathname === '/') &&
crawlerOptions.respondWith !== 'default') {
- throw new SecurityCompromiseError(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
+ throw new Error(`Your request is categorized as abuse. Please don't abuse our service. If you are sure you are not abusing, please authenticate yourself with an API key.`);
}
const blockade = (await DomainBlockade.fromFirestoreQuery(
DomainBlockade.COLLECTION
@@ -669,7 +669,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
.limit(1)
))[0];
if (blockade) {
- throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
+ throw new Error(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
}
@@ -940,7 +940,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
} catch (err: any) {
- if (cache && !(err instanceof SecurityCompromiseError)) {
+ if (cache && !(err instanceof Error)) {
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
return;
|
2024-08-14 13:45:01 +05:30 |
|
Harsh Gupta (aider)
|
d15b721bfa
|
refactor: Remove rate limiting from crawler.ts
|
2024-08-14 13:43:04 +05:30 |
|
Harsh Gupta
|
aa862d4247
|
fix: Refactor crawler.ts by removing unused imports and code
|
2024-08-14 13:43:03 +05:30 |
|
Yanlong Wang
|
df58fcb3fa
|
fix: alleviate search performance issue
|
2024-08-09 15:03:24 +08:00 |
|
Yanlong Wang
|
eb74e9c6f8
|
fix: remove select element from markdown to work around turndown performance issue
|
2024-08-09 10:55:36 +08:00 |
|
Yanlong Wang
|
e4ef6cb0f9
|
chore: reduce fetch count in search
|
2024-08-09 10:29:50 +08:00 |
|
Yanlong Wang
|
e529369ba6
|
fix: search with failed pages
|
2024-08-08 15:49:23 +08:00 |
|
Yanlong Wang
|
0dd05b5dab
|
chore: tweak concurrency
|
2024-08-06 17:58:27 +08:00 |
|
Yanlong Wang
|
7af2bde01f
|
fix: html rebasing with <base> tag
|
2024-08-06 13:15:10 +08:00 |
|
Yanlong Wang
|
40e91853e2
|
fix
|
2024-08-02 20:10:17 +08:00 |
|
Yanlong Wang
|
cda0f371e1
|
feat: updated rate policy
|
2024-08-02 19:39:51 +08:00 |
|
Yanlong Wang
|
0a2c0932fd
|
fix
|
2024-08-02 17:13:50 +08:00 |
|
Yanlong Wang
|
ee632199df
|
fix
|
2024-08-02 17:12:10 +08:00 |
|
Yanlong Wang
|
0a33207f8f
|
fix: another approach to suspected DoS abuse
|
2024-08-02 17:04:13 +08:00 |
|
yanlong.wang
|
e658e8102c
|
fix
|
2024-08-01 20:07:39 +08:00 |
|
yanlong.wang
|
f4f189c8e6
|
fix
|
2024-08-01 19:51:53 +08:00 |
|
yanlong.wang
|
54fa5feb7f
|
fix
|
2024-08-01 19:49:40 +08:00 |
|
yanlong.wang
|
d0a922144d
|
fix
|
2024-08-01 19:48:00 +08:00 |
|
yanlong.wang
|
6fb5df97cc
|
fix: abuse of flooding elements
|
2024-08-01 19:34:39 +08:00 |
|
yanlong.wang
|
8b7af6d076
|
fix: ignore match all target selectors for performance
|
2024-07-31 14:06:22 +08:00 |
|
yanlong.wang
|
a08218506e
|
fix: truncate svg in jsdom
|
2024-07-31 13:12:57 +08:00 |
|
Yanlong Wang
|
4e5aff3332
|
debug: log jsdom and turndown operations
|
2024-07-31 11:12:12 +08:00 |
|
Yanlong Wang
|
0f239793d2
|
fix: also recover screenshot pricing
|
2024-07-30 20:11:55 +08:00 |
|
Yanlong Wang
|
d3f3a8502a
|
fix: revert screenshot behavior and introduce pageshot
|
2024-07-30 20:09:06 +08:00 |
|
yanlong.wang
|
57cbae864e
|
fix: jsdom, cache tolerance, screenshot pricing
|
2024-07-30 17:47:26 +08:00 |
|
yanlong.wang
|
77c8480ca6
|
feat: with-iframe and full-page screenshot
|
2024-07-30 15:08:09 +08:00 |
|
Yanlong Wang
|
e4d46e7acb
|
fix: count parameter
|
2024-07-25 19:46:28 +08:00 |
|
Yanlong Wang
|
1c4b64fe04
|
feat: bring your own html
|
2024-07-25 16:54:28 +08:00 |
|
Yanlong Wang
|
78ea13b101
|
fix
|
2024-07-25 15:14:36 +08:00 |
|
Yanlong Wang
|
9bcde30f11
|
fix
|
2024-07-25 14:56:19 +08:00 |
|
Yanlong Wang
|
09dbbd3b0f
|
fix: retry for brave search
|
2024-07-25 08:56:02 +08:00 |
|
Yanlong Wang
|
ec7c2ab52c
|
fix: scrap timing
|
2024-07-24 22:59:00 +08:00 |
|
yanlong.wang
|
873994397a
|
fix: allow POST with url
|
2024-07-24 15:08:32 +08:00 |
|
yanlong.wang
|
786b1828b7
|
fix: brave search operators in headers
|
2024-07-23 15:50:28 +08:00 |
|
yanlong.wang
|
efe7a61e3b
|
fix: timeout parameter
|
2024-07-18 12:33:19 +08:00 |
|
yanlong.wang
|
61ff011c13
|
fix: stop early return when timeout is explicitly defined
|
2024-07-18 11:41:58 +08:00 |
|
yanlong.wang
|
d0e2920163
|
feat: expose brave search operators explicitly
|
2024-07-15 18:11:58 +08:00 |
|
Yanlong Wang
|
336931b5e8
|
chore: tweak deployment
|
2024-07-12 15:10:01 +08:00 |
|
yanlong.wang
|
c6634298c4
|
fix: provide empty content field
|
2024-07-09 18:14:19 +08:00 |
|
Yanlong Wang
|
f6a183f87a
|
fix
|
2024-07-06 01:00:37 +08:00 |
|
Yanlong Wang
|
c69ec77c60
|
fix
|
2024-07-06 00:59:37 +08:00 |
|
Yanlong Wang
|
c9ef602a94
|
fix: potential firebase blockade
|
2024-07-06 00:25:48 +08:00 |
|
yanlong.wang
|
1bcb5a742e
|
fix: dos abuse
|
2024-07-01 18:40:14 +08:00 |
|
Yanlong Wang
|
0a09aeb967
|
fix
|
2024-07-01 11:06:30 +08:00 |
|
Yanlong Wang
|
8951578ef1
|
chore
|
2024-07-01 00:29:12 +08:00 |
|
Yanlong Wang
|
858e52fe2e
|
chore: tweak cpu
|
2024-06-30 22:19:03 +08:00 |
|
yanlong.wang
|
62fb6cff94
|
feat: keepImgDataUrl
|
2024-06-24 15:18:53 +08:00 |
|
yanlong.wang
|
1084b16c84
|
fix: typo
|
2024-06-20 18:31:56 +08:00 |
|
yanlong.wang
|
579f259cb9
|
fix: detect when readability does not work
|
2024-06-20 18:20:13 +08:00 |
|
yanlong.wang
|
eaa06781e3
|
fix: normalize-url pollution
|
2024-06-20 14:53:25 +08:00 |
|
yanlong.wang
|
6f37e5d3b4
|
feat: x-remove-selector
|
2024-06-18 18:07:38 +08:00 |
|
yanlong.wang
|
ee008ebe10
|
fix: improved code rules
|
2024-06-13 16:27:30 +08:00 |
|
yanlong.wang
|
fd9a86bc00
|
chore: fix abuse timing
|
2024-06-11 13:57:19 +08:00 |
|
Yanlong Wang
|
70d80bbcfe
|
fix: abuse condition
|
2024-06-10 17:41:38 +08:00 |
|
Yanlong Wang
|
5789ae1407
|
chore: dont abuse our service
|
2024-06-10 17:23:50 +08:00 |
|
yanlong.wang
|
1e3bae6aad
|
fix: timeout parsing
|
2024-06-05 19:50:48 +08:00 |
|
yanlong.wang
|
a9936d322e
|
fix: search descriptions
|
2024-06-05 19:47:04 +08:00 |
|
yanlong.wang
|
165cce6c91
|
refactor: options dto
|
2024-06-05 18:55:40 +08:00 |
|
Yanlong Wang
|
f0668a96b4
|
fix: potential circular crawling
|
2024-06-02 23:23:39 +08:00 |
|
Yanlong Wang
|
be91371b93
|
fix: ignore blockade for authenticated users
|
2024-06-02 09:09:21 +08:00 |
|
Yanlong Wang
|
154d8ede45
|
fix: truncate svg
|
2024-06-02 08:57:39 +08:00 |
|
Yanlong Wang
|
7a7e49bc00
|
fix: blockade query
|
2024-06-01 08:06:46 +08:00 |
|
Yanlong Wang
|
d2bebec60f
|
fix: abuse blocker
|
2024-06-01 02:01:12 +08:00 |
|
Yanlong Wang
|
249408df6b
|
fix
|
2024-06-01 01:07:50 +08:00 |
|
Yanlong Wang
|
43dee08dcc
|
security: detect abuse
|
2024-06-01 00:57:51 +08:00 |
|
Yanlong Wang
|
908157b61e
|
fix: pdf cache
|
2024-05-31 19:05:17 +08:00 |
|
Yanlong Wang
|
9c60b4b93d
|
fix: setup expire for pdf caches
|
2024-05-31 18:36:23 +08:00 |
|
Yanlong Wang
|
1ba21da0c5
|
fix: pdf cache
|
2024-05-31 18:26:05 +08:00 |
|
Yanlong Wang
|
fd0b77285f
|
fix: firebase fail to save large docs
|
2024-05-31 18:16:37 +08:00 |
|
Yanlong Wang
|
964b66b6ab
|
fix: data crunching import
|
2024-05-31 17:32:16 +08:00 |
|
Yanlong Wang
|
9ac40606d5
|
fix: bulk fix multiple issues
|
2024-05-31 17:30:57 +08:00 |
|
Yanlong Wang
|
0c15946874
|
fix: trimstart url
|
2024-05-30 20:29:31 +08:00 |
|
Yanlong Wang
|
33e14e5404
|
feat: extract text from pdf (#70)
* feat: pdf
* fix
* fix
|
2024-05-30 20:21:33 +08:00 |
|
yanlong.wang
|
7c5712363c
|
feat: allow custom rate limit per uid
|
2024-05-23 15:36:09 +08:00 |
|
yanlong.wang
|
8eee95119d
|
feat: index brief in JSON format
|
2024-05-23 12:06:07 +08:00 |
|
yanlong.wang
|
4f37de24f6
|
fix: docs
|
2024-05-21 17:35:16 +08:00 |
|
Yanlong Wang
|
a8e0628460
|
feat: links and images summary (#63)
* wip: dedicated link and image summary
* fix
* fix
* fix
* fix: docs
* fix
* fix
* fix
|
2024-05-21 17:34:19 +08:00 |
|
Yanlong Wang
|
df71c9a534
|
fix: stop using pool
|
2024-05-20 01:12:22 +08:00 |
|
Yanlong Wang
|
4077fa7040
|
fix: geoip encoding
|
2024-05-17 09:31:22 +08:00 |
|
Yanlong Wang
|
2941be6096
|
fix: potential unencoded query
|
2024-05-17 09:15:37 +08:00 |
|
Yanlong Wang
|
ed9e9f43cf
|
fix: block rough requests
|
2024-05-16 20:22:26 +08:00 |
|
yanlong.wang
|
8ec8c1e718
|
fix: logging for search error
|
2024-05-16 19:01:30 +08:00 |
|
yanlong.wang
|
e0e37ad4d7
|
fix: potential chargeAmount mismatch
|
2024-05-16 18:43:41 +08:00 |
|
yanlong.wang
|
8b0916f858
|
fix: race condition while logging chargeAmount
|
2024-05-16 18:26:18 +08:00 |
|
yanlong.wang
|
6f4819bc49
|
chore: tweak deployment
|
2024-05-16 17:46:53 +08:00 |
|
yanlong.wang
|
322cb86f21
|
fix: on no results
|
2024-05-16 17:30:47 +08:00 |
|
yanlong.wang
|
e2698b48bd
|
fix: rate limit tag for search
|
2024-05-16 16:58:10 +08:00 |
|
yanlong.wang
|
72e1c46a6c
|
fix: improve search responsiveness
|
2024-05-16 15:47:49 +08:00 |
|
Yanlong Wang
|
0583645613
|
fix: noCache in search
|
2024-05-16 00:42:30 +08:00 |
|
Yanlong Wang
|
4556954d17
|
fix: image url
|
2024-05-16 00:39:24 +08:00 |
|
Yanlong Wang
|
6f65083f8d
|
feat: control cache tolerance and select target using headers
|
2024-05-16 00:10:20 +08:00 |
|
yanlong.wang
|
77fc500f41
|
fix: allow x-return-format header alias
|
2024-05-15 12:24:46 +08:00 |
|
Yanlong Wang
|
445624c405
|
fix: early return for search
|
2024-05-15 08:47:16 +08:00 |
|
Yanlong Wang
|
1cf8e83857
|
fix: add cache tolerance
|
2024-05-15 08:06:35 +08:00 |
|
Yanlong Wang
|
d100c3fc5f
|
fix: search result cache save
|
2024-05-14 19:57:49 +08:00 |
|
Yanlong Wang
|
ec4ce4fef3
|
chore: update rate limits
|
2024-05-14 19:44:35 +08:00 |
|
Yanlong Wang
|
2e3c217479
|
feat: web search (#57)
|
2024-05-14 19:39:43 +08:00 |
|
Yanlong Wang
|
f171e54ac9
|
fix: log charge amount
|
2024-05-14 17:25:59 +08:00 |
|