// dify/web/models/datasets.ts
//
// 577 lines
// 11 KiB
// TypeScript

import type { DataSourceNotionPage, DataSourceProvider } from './common'
import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
import type { Tag } from '@/app/components/base/tag-management/constant'
/**
 * Kinds of data source a dataset or document can be created from.
 * Each member maps to the string value used in the `data_source_type`
 * fields of the types in this file.
 */
export enum DataSourceType {
  /** Value `'upload_file'`. */
  FILE = 'upload_file',
  /** Value `'notion_import'`. */
  NOTION = 'notion_import',
  /** Value `'website_crawl'`. */
  WEB = 'website_crawl',
}
/**
 * Permission setting carried in the `permission` field of `DataSet` and
 * `ExternalKnowledgeItem`; exactly one of the three literals below.
 */
export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members'
/**
 * Shape of a dataset record.
 * Every field is required except `partial_member_list`.
 * Presumably mirrors the backend API payload — TODO confirm against the API.
 */
export type DataSet = {
id: string
name: string
icon: string
icon_background: string
description: string
permission: DatasetPermission
data_source_type: DataSourceType
indexing_technique: 'high_quality' | 'economy'
created_by: string
updated_by: string
updated_at: number
app_count: number
document_count: number
word_count: number
provider: string
embedding_model: string
embedding_model_provider: string
embedding_available: boolean
// NOTE(review): `retrieval_model_dict` and `retrieval_model` share the same type;
// confirm which one consumers are expected to read.
retrieval_model_dict: RetrievalConfig
retrieval_model: RetrievalConfig
tags: Tag[]
// NOTE(review): element type is `any`; the member shape is not visible in this file.
partial_member_list?: any[]
// Identifiers and endpoint of the external knowledge API tied to this dataset.
external_knowledge_info: {
external_knowledge_id: string
external_knowledge_api_id: string
external_knowledge_api_name: string
external_knowledge_api_endpoint: string
}
// Retrieval settings for the external case: top-k plus an optional score threshold.
external_retrieval_model: {
top_k: number
score_threshold: number
score_threshold_enabled: boolean
}
}
/**
 * Record describing an external API entry: identity, tenant, connection
 * settings, and the datasets bound to it.
 */
export type ExternalAPIItem = {
id: string
tenant_id: string
name: string
description: string
settings: {
endpoint: string
// NOTE(review): a credential field; confirm whether the value is masked before it reaches the client.
api_key: string
}
// Each binding carries only the bound dataset's id and name.
dataset_bindings: { id: string; name: string }[]
created_by: string
// A string here, whereas several other `created_at` fields in this file are numbers.
created_at: string
}
/**
 * Record for an externally provided knowledge item.
 * `provider` is fixed to the literal `'external'`, and `data_source_type`
 * and `indexing_technique` are always `null` in this shape.
 */
export type ExternalKnowledgeItem = {
id: string
name: string
description: string | null
provider: 'external'
permission: DatasetPermission
data_source_type: null
indexing_technique: null
app_count: number
document_count: number
word_count: number
created_by: string
// Timestamps are typed as strings in this shape.
created_at: string
updated_by: string
updated_at: string
tags: Tag[]
}
/** Response shape whose single `result` field is either `'success'` or `'error'`. */
export type ExternalAPIDeleteResponse = {
result: 'success' | 'error'
}
/**
 * Usage summary consisting of a boolean flag and a count.
 * Presumably reports whether an external API is in use and by how many
 * datasets — TODO confirm against the endpoint that returns it.
 */
export type ExternalAPIUsage = {
is_using: boolean
count: number
}
/**
 * A `File` extended with optional metadata fields.
 * All added fields are optional, so a plain `File` is assignable to this type.
 */
export type CustomFile = File & {
id?: string
extension?: string
mime_type?: string
created_by?: string
created_at?: number
}
/** Options for a website crawl. */
export type CrawlOptions = {
crawl_sub_pages: boolean
only_main_content: boolean
// Both are plain strings; the expected pattern syntax is not visible in this file.
includes: string
excludes: string
// Accept either a number or a string — presumably because the values may
// come straight from text inputs; TODO confirm.
limit: number | string
max_depth: number | string
use_sitemap: boolean
}
/** One crawled page: its title, markdown content, description, and source URL. */
export type CrawlResultItem = {
title: string
markdown: string
description: string
source_url: string
}
/** Pairs a `CustomFile` with an identifier and a numeric progress value. */
export type FileItem = {
fileID: string
file: CustomFile
// NOTE(review): the range and units of `progress` are not defined in this file.
progress: number
}
/** One page of `DataSet` records together with pagination fields. */
export type DataSetListResponse = {
data: DataSet[]
has_more: boolean
limit: number
page: number
total: number
}
/** One page of `ExternalAPIItem` records together with pagination fields. */
export type ExternalAPIListResponse = {
data: ExternalAPIItem[]
has_more: boolean
limit: number
page: number
total: number
}
/** A question/answer pair. */
export type QA = {
question: string
answer: string
}
/**
 * Result of an indexing estimate: token count, price with currency,
 * segment count, and preview content.
 */
export type IndexingEstimateResponse = {
tokens: number
total_price: number
currency: string
total_segments: number
preview: string[]
// Optional question/answer previews — presumably only present for the Q&A document form; TODO confirm.
qa_preview?: QA[]
}
/** `IndexingEstimateResponse` with an additional required `total_nodes` count. */
export type FileIndexingEstimateResponse = {
total_nodes: number
} & IndexingEstimateResponse
/**
 * Indexing progress for a single item: current status, per-stage
 * timestamps, and completed/total segment counts.
 */
export type IndexingStatusResponse = {
id: string
indexing_status: DocumentIndexingStatus
processing_started_at: number
parsing_completed_at: number
cleaning_completed_at: number
splitting_completed_at: number
// NOTE(review): the next four fields are `any`, so consumers get no type
// checking on them. Their real types are not visible in this file.
completed_at: any
paused_at: any
error: any
stopped_at: any
completed_segments: number
total_segments: number
}
/** Wrapper holding a list of `IndexingStatusResponse` entries under `data`. */
export type IndexingStatusBatchResponse = {
data: IndexingStatusResponse[]
}
/** Processing mode: either `'automatic'` or `'custom'`. */
export type ProcessMode = 'automatic' | 'custom'
/** A process rule as returned in a response: a `ProcessMode` plus its `Rules`. */
export type ProcessRuleResponse = {
mode: ProcessMode
rules: Rules
}
/** Processing rules: a list of pre-processing rule toggles and one segmentation config. */
export type Rules = {
pre_processing_rules: PreProcessingRule[]
segmentation: Segmentation
}
/** A single pre-processing rule, identified by `id` and switched by `enabled`. */
export type PreProcessingRule = {
id: string
enabled: boolean
}
/** Segmentation settings: separator string, maximum tokens, and chunk overlap. */
export type Segmentation = {
separator: string
max_tokens: number
// NOTE(review): the unit of the overlap (tokens vs. characters) is not stated in this file.
chunk_overlap: number
}
/**
 * Every value the `indexing_status` field of a document can take.
 * Declared `as const` so the derived type below is a union of these literals.
 */
export const DocumentIndexingStatusList = [
  'waiting',
  'parsing',
  'cleaning',
  'splitting',
  'indexing',
  'paused',
  'error',
  'completed',
] as const

/** Union of the literal values in `DocumentIndexingStatusList`. */
export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
/**
 * Every value the `display_status` field of a document can take.
 * Declared `as const` so the derived type below is a union of these literals.
 */
export const DisplayStatusList = [
  'queuing',
  'indexing',
  'paused',
  'error',
  'available',
  'enabled',
  'disabled',
  'archived',
] as const

/** Union of the literal values in `DisplayStatusList`. */
export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
/**
 * Source details attached to a document.
 * Combines uploaded-file fields, optional Notion fields, and `job_id`/`url`
 * in one shape. NOTE(review): `upload_file`, `job_id` and `url` are all
 * required here regardless of source type — confirm this matches the payloads.
 */
export type DataSourceInfo = {
upload_file: {
id: string
name: string
size: number
mime_type: string
created_at: number
created_by: string
extension: string
}
notion_page_icon?: string
notion_workspace_id?: string
notion_page_id?: string
provider?: DataSourceProvider
job_id: string
url: string
}
/**
 * Base document shape; `SimpleDocumentDetail` and `FullDocumentDetail`
 * build on it through intersections.
 */
export type InitialDocumentDetail = {
id: string
batch: string
position: number
dataset_id: string
data_source_type: DataSourceType
data_source_info: DataSourceInfo
dataset_process_rule_id: string
name: string
created_from: 'api' | 'web'
created_by: string
created_at: number
indexing_status: DocumentIndexingStatus
display_status: DocumentDisplayStatus
completed_segments?: number
total_segments?: number
// Same string values as the members of the `DocForm` enum in this file.
doc_form: 'text_model' | 'qa_model'
doc_language: string
}
/**
 * `InitialDocumentDetail` plus the additional fields below; this is the
 * element type of `DocumentListResponse.data`.
 */
export type SimpleDocumentDetail = InitialDocumentDetail & {
enabled: boolean
word_count: number
error?: string | null
archived: boolean
updated_at: number
hit_count: number
// Re-declared as optional, but `InitialDocumentDetail` declares it required,
// so in the intersection the property remains required.
dataset_process_rule_id?: string
data_source_detail_dict?: {
upload_file: {
name: string
extension: string
}
}
}
/** One page of `SimpleDocumentDetail` records together with pagination fields. */
export type DocumentListResponse = {
data: SimpleDocumentDetail[]
has_more: boolean
total: number
page: number
limit: number
}
/**
 * Fields shared by document requests; extended by `CreateDocumentReq` and
 * `IndexingEstimateParams`.
 */
export type DocumentReq = {
original_document_id?: string
// NOTE(review): plain `string` here, while `DataSet.indexing_technique` is
// the union `'high_quality' | 'economy'`.
indexing_technique?: string
doc_form: 'text_model' | 'qa_model'
doc_language: string
process_rule: ProcessRule
}
/** `DocumentReq` plus the data source, retrieval config, and embedding model fields. */
export type CreateDocumentReq = DocumentReq & {
data_source: DataSource
retrieval_model: RetrievalConfig
embedding_model: string
embedding_model_provider: string
}
/**
 * `DocumentReq` combined with an all-optional `DataSource` (so `type` and
 * `info_list` may be omitted) and a required `dataset_id`.
 */
export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
dataset_id: string
}
/**
 * Describes where a document's data comes from.
 * `info_list` repeats the source type and carries one optional sub-list per
 * kind of source (Notion, file, website). Presumably only the sub-list
 * matching `type` is populated — TODO confirm.
 */
export type DataSource = {
type: DataSourceType
info_list: {
data_source_type: DataSourceType
notion_info_list?: NotionInfo[]
file_info_list?: {
file_ids: string[]
}
website_info_list?: {
provider: string
job_id: string
urls: string[]
}
}
}
/** A Notion workspace id together with the pages taken from it. */
export type NotionInfo = {
workspace_id: string
pages: DataSourceNotionPage[]
}
/**
 * Minimal Notion page reference: a page id and a type string.
 * Not referenced by any other type in the visible part of this file.
 */
export type NotionPage = {
page_id: string
type: string
}
/**
 * Process rule as sent in requests (see `DocumentReq.process_rule`).
 * NOTE(review): `mode` is a plain `string` here, whereas
 * `ProcessRuleResponse.mode` uses the narrower `ProcessMode` union.
 */
export type ProcessRule = {
mode: string
rules: Rules
}
/**
 * Response to a document-creation request: the batch id, the created
 * documents, and optionally the dataset.
 * NOTE(review): the name starts with a lower-case letter, unlike every other
 * exported type in this file; renaming would break existing imports.
 */
export type createDocumentResponse = {
dataset?: DataSet
batch: string
documents: InitialDocumentDetail[]
}
/**
 * `SimpleDocumentDetail` plus processing timestamps, pause/stop/disable/archive
 * bookkeeping, and document-type metadata.
 */
export type FullDocumentDetail = SimpleDocumentDetail & {
// Already declared with the same type on `InitialDocumentDetail`.
batch: string
created_api_request_id: string
processing_started_at: number
parsing_completed_at: number
cleaning_completed_at: number
splitting_completed_at: number
tokens: number
indexing_latency: number
completed_at: number
paused_by: string
paused_at: number
stopped_at: number
// Declared as `string`, but intersected with `DocumentIndexingStatus` from
// `InitialDocumentDetail`, so the effective type is still that union.
indexing_status: string
disabled_at: number
disabled_by: string
archived_reason: 'rule_modified' | 're_upload'
archived_by: string
archived_at: number
// May be a `DocType`, the literal `'others'`, or `null`.
doc_type?: DocType | null | 'others'
doc_metadata?: DocMetadata | null
segment_count: number
// NOTE(review): this index signature permits arbitrary extra properties and
// types any unknown key as `any`, which weakens checking for this shape.
[key: string]: any
}
/**
 * String-valued document metadata.
 * The named fields are required; the index signature allows any further
 * string-valued keys.
 */
export type DocMetadata = {
title: string
language: string
author: string
publisher: string
// These two keys are camelCase/upper-case, unlike the snake_case used elsewhere in this file.
publicationDate: string
ISBN: string
category: string
[key: string]: string
}
/**
 * Document types in the "customizable" group.
 * NOTE(review): presumably the types a user may pick by hand — confirm with the UI code.
 */
export const CUSTOMIZABLE_DOC_TYPES = [
  'book',
  'web_page',
  'paper',
  'social_media_post',
  'personal_document',
  'business_document',
  'im_chat_log',
] as const

/**
 * Document types in the "fixed" group.
 * NOTE(review): presumably assigned by the system rather than chosen — confirm.
 */
export const FIXED_DOC_TYPES = [
  'synced_from_github',
  'synced_from_notion',
  'wikipedia_entry',
] as const

/** Union of the literal values in `CUSTOMIZABLE_DOC_TYPES`. */
export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]

/** Union of the literal values in `FIXED_DOC_TYPES`. */
export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]

/** Any document type from either group. */
export type DocType = CustomizableDocType | FixedDocType
/** Alias of `FullDocumentDetail`. */
export type DocumentDetailResponse = FullDocumentDetail
/**
 * Every value a segment's `status` can take.
 *
 * Declared `as const`, like the other status lists in this file, so that
 * `SegmentStatus` is the union of these four literals. Without the const
 * assertion the array is inferred as `string[]` and `SegmentStatus` widens
 * to plain `string`, which lets any typo through the type checker.
 */
export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

/** Union of the literal values in `SEGMENT_STATUS_LIST`. */
export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
/** Query parameters for listing segments; only `limit` is required. */
export type SegmentsQuery = {
last_id?: string
limit: number
// A status filter is sketched here but currently disabled.
// status?: SegmentStatus
hit_count_gte?: number
keyword?: string
enabled?: boolean
}
/**
 * Full record for a single segment: content, counts, index node
 * identifiers, enable/disable state, status, and timestamps.
 */
export type SegmentDetailModel = {
id: string
position: number
document_id: string
content: string
word_count: number
tokens: number
keywords: string[]
index_node_id: string
index_node_hash: string
hit_count: number
enabled: boolean
disabled_at: number
disabled_by: string
status: SegmentStatus
created_by: string
created_at: number
indexing_at: number
completed_at: number
error: string | null
stopped_at: number
// Optional — presumably only present for Q&A-form documents; TODO confirm.
answer?: string
}
/**
 * A list of `SegmentDetailModel` records with `has_more`, `limit` and `total`.
 * Unlike the other list responses in this file, it has no `page` field.
 */
export type SegmentsResponse = {
data: SegmentDetailModel[]
has_more: boolean
limit: number
total: number
}
/** A recorded hit-testing query: its content, where it came from, and who created it. */
export type HitTestingRecord = {
id: string
content: string
source: 'app' | 'hit_testing' | 'plugin'
source_app_id: string
created_by_role: 'account' | 'end_user'
created_by: string
created_at: number
}
/** One hit-testing result: the matched segment, its score, and a 2-D position. */
export type HitTesting = {
segment: Segment
score: number
tsne_position: TsnePosition
}
/**
 * One hit-testing result from an external knowledge base.
 * NOTE(review): the `metadata` keys are named after Amazon Bedrock knowledge
 * base fields and are both required — confirm whether other external
 * providers return the same keys.
 */
export type ExternalKnowledgeBaseHitTesting = {
content: string
title: string
score: number
metadata: {
'x-amz-bedrock-kb-source-uri': string
'x-amz-bedrock-kb-data-source-id': string
}
}
/**
 * Segment as embedded in a `HitTesting` result; it carries its parent
 * document inline rather than a document id.
 */
export type Segment = {
id: string
document: Document
content: string
position: number
word_count: number
tokens: number
keywords: string[]
hit_count: number
index_node_hash: string
}
/**
 * Minimal document summary used by `Segment.document`.
 * NOTE(review): shares its name with the DOM's global `Document`; inside this
 * module the name refers to this type.
 */
export type Document = {
id: string
// A plain `string` here, while other types in this file use the `DataSourceType` enum.
data_source_type: string
name: string
doc_type: DocType
}
/** One page of `HitTestingRecord` entries together with pagination fields. */
export type HitTestingRecordsResponse = {
data: HitTestingRecord[]
has_more: boolean
limit: number
total: number
page: number
}
/** A 2-D coordinate; the name suggests a t-SNE projection, but its origin is not shown in this file. */
export type TsnePosition = {
x: number
y: number
}
/** Hit-testing response: the query (content and position) and the matching records. */
export type HitTestingResponse = {
query: {
content: string
tsne_position: TsnePosition
}
records: Array<HitTesting>
}
/**
 * Hit-testing response for an external knowledge base; the query carries
 * only `content`, with no position.
 */
export type ExternalKnowledgeBaseHitTestingResponse = {
query: {
content: string
}
records: Array<ExternalKnowledgeBaseHitTesting>
}
/** Summary of an app: identity, mode, and icon fields. */
export type RelatedApp = {
id: string
name: string
mode: AppMode
// The only nullable field in this shape.
icon_type: AppIconType | null
icon: string
icon_background: string
icon_url: string
}
/** A list of `RelatedApp` entries and their total count. */
export type RelatedAppResponse = {
data: Array<RelatedApp>
total: number
}
/** Payload for updating a segment: `content` is required, `answer` and `keywords` are optional. */
export type SegmentUpdater = {
content: string
answer?: string
keywords?: string[]
}
/**
 * Document forms. The member values are the same strings accepted by the
 * `doc_form` fields of `InitialDocumentDetail` and `DocumentReq`.
 */
export enum DocForm {
  /** Value `'text_model'`. */
  TEXT = 'text_model',
  /** Value `'qa_model'`. */
  QA = 'qa_model',
}
/** A list of `IndexingStatusResponse` entries and their total count. */
export type ErrorDocsResponse = {
data: IndexingStatusResponse[]
total: number
}
/**
 * Boolean flags describing a selection of datasets.
 * The names suggest they summarise indexing technique, search method,
 * internal/external origin, and embedding-model consistency; the code that
 * computes them is not in this file.
 */
export type SelectedDatasetsMode = {
allHighQuality: boolean
// These two names are also keys of `DEFAULT_WEIGHTED_SCORE`.
allHighQualityVectorSearch: boolean
allHighQualityFullTextSearch: boolean
allEconomic: boolean
mixtureHighQualityAndEconomic: boolean
allInternal: boolean
allExternal: boolean
mixtureInternalAndExternal: boolean
inconsistentEmbeddingModel: boolean
}
/** Weighted-score presets, each mapped to its string value. */
export enum WeightedScoreEnum {
  /** Value `'semantic_first'`. */
  SemanticFirst = 'semantic_first',
  /** Value `'keyword_first'`. */
  KeywordFirst = 'keyword_first',
  /** Value `'customized'`. */
  Customized = 'customized',
}
/** Reranking modes, each mapped to its string value. */
export enum RerankingModeEnum {
  /** Value `'reranking_model'`. */
  RerankingModel = 'reranking_model',
  /** Value `'weighted_score'`. */
  WeightedScore = 'weighted_score',
}
/**
 * Default semantic/keyword weight pairs.
 * The first two keys match flag names in `SelectedDatasetsMode`; `other`
 * is presumably the fallback when neither flag applies — TODO confirm
 * against the code that reads this constant.
 * Each pair sums to 1.
 */
export const DEFAULT_WEIGHTED_SCORE = {
  // Semantic weight only.
  allHighQualityVectorSearch: { semantic: 1, keyword: 0 },
  // Keyword weight only.
  allHighQualityFullTextSearch: { semantic: 0, keyword: 1 },
  // Mixed weighting.
  other: { semantic: 0.7, keyword: 0.3 },
}