fix: Refactor crawler.ts by removing unused imports and code

This commit is contained in:
Harsh Gupta 2024-08-14 13:43:03 +05:30 committed by Harsh Gupta (aider)
parent df58fcb3fa
commit aa862d4247

View File

@@ -5,8 +5,7 @@ import {
AssertionFailureError, ParamValidationError, Defer, AssertionFailureError, ParamValidationError, Defer,
} from 'civkit'; } from 'civkit';
import { singleton } from 'tsyringe'; import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared'; import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash'; import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express'; import { Request, Response } from 'express';
@@ -16,11 +15,10 @@ import TurndownService from 'turndown';
import { Crawled } from '../db/crawled'; import { Crawled } from '../db/crawled';
import { cleanAttribute } from '../utils/misc'; import { cleanAttribute } from '../utils/misc';
import { randomUUID } from 'crypto'; import { randomUUID } from 'crypto';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { countGPTToken as estimateToken } from '../shared/utils/openai'; import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options'; import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { PDFExtractor } from '../services/pdf-extract'; import { PDFExtractor } from '../services/pdf-extract';
import { DomainBlockade } from '../db/domain-blockade'; import { DomainBlockade } from '../db/domain-blockade';
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker'; import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
@@ -81,7 +79,6 @@ export class CrawlerHost extends RPCHost {
protected altTextService: AltTextService, protected altTextService: AltTextService,
protected pdfExtractor: PDFExtractor, protected pdfExtractor: PDFExtractor,
protected firebaseObjectStorage: FirebaseStorageBucketControl, protected firebaseObjectStorage: FirebaseStorageBucketControl,
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncContext, protected threadLocal: AsyncContext,
protected fbHealthCheck: FirebaseRoundTripChecker, protected fbHealthCheck: FirebaseRoundTripChecker,
) { ) {
@@ -121,7 +118,7 @@ export class CrawlerHost extends RPCHost {
this.emit('ready'); this.emit('ready');
} }
getIndex(user?: JinaEmbeddingsTokenAccount) { getIndex() {
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto); const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
Object.assign(indexObject, { Object.assign(indexObject, {
@@ -131,12 +128,6 @@ export class CrawlerHost extends RPCHost {
sourceCode: 'https://github.com/jina-ai/reader', sourceCode: 'https://github.com/jina-ai/reader',
}); });
if (user) {
indexObject[''] = undefined;
indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
indexObject.balanceLeft = user.wallet.total_balance;
}
return indexObject; return indexObject;
} }
@@ -375,7 +366,6 @@ export class CrawlerHost extends RPCHost {
let contentText = ''; let contentText = '';
const imageSummary = {} as { [k: string]: string; }; const imageSummary = {} as { [k: string]: string; };
const imageIdxTrack = new Map<string, number[]>(); const imageIdxTrack = new Map<string, number[]>();
const uid = this.threadLocal.get('uid');
do { do {
if (pdfMode) { if (pdfMode) {
contentText = snapshot.parsed?.content || snapshot.text; contentText = snapshot.parsed?.content || snapshot.text;
@@ -384,7 +374,6 @@ export class CrawlerHost extends RPCHost {
if ( if (
snapshot.maxElemDepth! > 256 || snapshot.maxElemDepth! > 256 ||
(!uid && snapshot.elemCount! > 10_000) ||
snapshot.elemCount! > 70_000 snapshot.elemCount! > 70_000
) { ) {
this.logger.warn('Degrading to text to protect the server', { url: snapshot.href }); this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });