mirror of
https://github.com/intergalacticalvariable/reader.git
synced 2024-11-16 11:42:32 +08:00
fix: Refactor crawler.ts by removing unused imports and code
This commit is contained in:
parent
df58fcb3fa
commit
aa862d4247
|
@ -5,8 +5,7 @@ import {
|
||||||
AssertionFailureError, ParamValidationError, Defer,
|
AssertionFailureError, ParamValidationError, Defer,
|
||||||
} from 'civkit';
|
} from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
||||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
|
@ -16,11 +15,10 @@ import TurndownService from 'turndown';
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import { cleanAttribute } from '../utils/misc';
|
import { cleanAttribute } from '../utils/misc';
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
|
||||||
|
|
||||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||||
|
|
||||||
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
||||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
|
||||||
import { PDFExtractor } from '../services/pdf-extract';
|
import { PDFExtractor } from '../services/pdf-extract';
|
||||||
import { DomainBlockade } from '../db/domain-blockade';
|
import { DomainBlockade } from '../db/domain-blockade';
|
||||||
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
||||||
|
@ -81,7 +79,6 @@ export class CrawlerHost extends RPCHost {
|
||||||
protected altTextService: AltTextService,
|
protected altTextService: AltTextService,
|
||||||
protected pdfExtractor: PDFExtractor,
|
protected pdfExtractor: PDFExtractor,
|
||||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
protected rateLimitControl: RateLimitControl,
|
|
||||||
protected threadLocal: AsyncContext,
|
protected threadLocal: AsyncContext,
|
||||||
protected fbHealthCheck: FirebaseRoundTripChecker,
|
protected fbHealthCheck: FirebaseRoundTripChecker,
|
||||||
) {
|
) {
|
||||||
|
@ -121,7 +118,7 @@ export class CrawlerHost extends RPCHost {
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
getIndex(user?: JinaEmbeddingsTokenAccount) {
|
getIndex() {
|
||||||
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
|
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
|
||||||
|
|
||||||
Object.assign(indexObject, {
|
Object.assign(indexObject, {
|
||||||
|
@ -131,12 +128,6 @@ export class CrawlerHost extends RPCHost {
|
||||||
sourceCode: 'https://github.com/jina-ai/reader',
|
sourceCode: 'https://github.com/jina-ai/reader',
|
||||||
});
|
});
|
||||||
|
|
||||||
if (user) {
|
|
||||||
indexObject[''] = undefined;
|
|
||||||
indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
|
|
||||||
indexObject.balanceLeft = user.wallet.total_balance;
|
|
||||||
}
|
|
||||||
|
|
||||||
return indexObject;
|
return indexObject;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -375,7 +366,6 @@ export class CrawlerHost extends RPCHost {
|
||||||
let contentText = '';
|
let contentText = '';
|
||||||
const imageSummary = {} as { [k: string]: string; };
|
const imageSummary = {} as { [k: string]: string; };
|
||||||
const imageIdxTrack = new Map<string, number[]>();
|
const imageIdxTrack = new Map<string, number[]>();
|
||||||
const uid = this.threadLocal.get('uid');
|
|
||||||
do {
|
do {
|
||||||
if (pdfMode) {
|
if (pdfMode) {
|
||||||
contentText = snapshot.parsed?.content || snapshot.text;
|
contentText = snapshot.parsed?.content || snapshot.text;
|
||||||
|
@ -384,7 +374,6 @@ export class CrawlerHost extends RPCHost {
|
||||||
|
|
||||||
if (
|
if (
|
||||||
snapshot.maxElemDepth! > 256 ||
|
snapshot.maxElemDepth! > 256 ||
|
||||||
(!uid && snapshot.elemCount! > 10_000) ||
|
|
||||||
snapshot.elemCount! > 70_000
|
snapshot.elemCount! > 70_000
|
||||||
) {
|
) {
|
||||||
this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
|
this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
|
||||||
|
|
Loading…
Reference in New Issue
Block a user