// dify/web/models/datasets.ts

import type { DataSourceNotionPage } from './common'
import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
import type { Tag } from '@/app/components/base/tag-management/constant'
/** String identifiers for the kinds of data source a dataset or document can have. */
export enum DataSourceType {
  /** Serialized as `'upload_file'`. */
  FILE = 'upload_file',
  /** Serialized as `'notion_import'`. */
  NOTION = 'notion_import',
  /** Serialized as `'website_crawl'`. */
  WEB = 'website_crawl',
}
/** The three literal values accepted for a dataset's `permission` field. */
export type DatasetPermission =
  | 'only_me'
  | 'all_team_members'
  | 'partial_members'
/**
 * Describes a dataset record.
 *
 * `retrieval_model_dict` and `retrieval_model` are both declared with the
 * same `RetrievalConfig` type.
 */
export type DataSet = {
  id: string
  name: string
  icon: string
  icon_background: string
  description: string
  permission: DatasetPermission
  data_source_type: DataSourceType
  /** Restricted to these two literal values. */
  indexing_technique: 'high_quality' | 'economy'
  created_by: string
  updated_by: string
  // NOTE(review): presumably a timestamp; unit is not visible here — confirm.
  updated_at: number
  app_count: number
  document_count: number
  word_count: number
  embedding_model: string
  embedding_model_provider: string
  embedding_available: boolean
  retrieval_model_dict: RetrievalConfig
  retrieval_model: RetrievalConfig
  tags: Tag[]
  /** Optional; the element shape is left untyped (`any`). */
  partial_member_list?: any[]
}
/**
 * A `File` extended with optional metadata fields.
 * Every added field is optional, so a plain `File` is assignable to this type.
 */
export type CustomFile = File & {
  id?: string
  extension?: string
  mime_type?: string
  created_by?: string
  created_at?: number
}
/** Options describing a website crawl. */
export type CrawlOptions = {
  crawl_sub_pages: boolean
  only_main_content: boolean
  includes: string
  excludes: string
  /** Accepts either a number or a string. */
  limit: number | string
  /** Accepts either a number or a string. */
  max_depth: number | string
}
/** One crawled page: its title, markdown body, description and source URL. */
export type CrawlResultItem = {
  title: string
  markdown: string
  description: string
  source_url: string
}
/** Pairs a `CustomFile` with an identifier and a numeric progress value. */
export type FileItem = {
  fileID: string
  file: CustomFile
  // NOTE(review): scale of `progress` (e.g. 0-100) is not visible here — confirm.
  progress: number
}
/** A page of `DataSet` entries together with paging fields. */
export type DataSetListResponse = {
  data: DataSet[]
  has_more: boolean
  limit: number
  page: number
  total: number
}
/** A question/answer pair. */
export type QA = {
  question: string
  answer: string
}
/** Estimate figures for an indexing run, plus preview content. */
export type IndexingEstimateResponse = {
  tokens: number
  total_price: number
  currency: string
  total_segments: number
  preview: string[]
  /** Optional list of question/answer previews. */
  qa_preview?: QA[]
}
/** `IndexingEstimateResponse` with an additional `total_nodes` count. */
export type FileIndexingEstimateResponse = {
  total_nodes: number
} & IndexingEstimateResponse
/**
 * Indexing progress for a single item, identified by `id`.
 *
 * `completed_at`, `paused_at`, `error` and `stopped_at` are typed `any`,
 * so the compiler does not check how they are used.
 */
export type IndexingStatusResponse = {
  id: string
  indexing_status: DocumentIndexingStatus
  processing_started_at: number
  parsing_completed_at: number
  cleaning_completed_at: number
  splitting_completed_at: number
  completed_at: any
  paused_at: any
  error: any
  stopped_at: any
  completed_segments: number
  total_segments: number
}
/** Wraps a list of `IndexingStatusResponse` entries under `data`. */
export type IndexingStatusBatchResponse = {
  data: Array<IndexingStatusResponse>
}
/** The two literal values accepted for a process rule's `mode`. */
export type ProcessMode =
  | 'automatic'
  | 'custom'
/** A process rule whose `mode` is restricted to `ProcessMode`. */
export type ProcessRuleResponse = {
  mode: ProcessMode
  rules: Rules
}
/** Pre-processing rules together with a segmentation configuration. */
export type Rules = {
  pre_processing_rules: Array<PreProcessingRule>
  segmentation: Segmentation
}
/** A single pre-processing rule: an identifier and an on/off flag. */
export type PreProcessingRule = {
  id: string
  enabled: boolean
}
/** Segmentation settings: a separator string and two numeric limits. */
export type Segmentation = {
  separator: string
  max_tokens: number
  chunk_overlap: number
}
/**
 * Document indexing status values, as a readonly tuple.
 * `as const` keeps each entry as a string literal type so a union can be
 * derived from it.
 */
export const DocumentIndexingStatusList = [
  'waiting',
  'parsing',
  'cleaning',
  'splitting',
  'indexing',
  'paused',
  'error',
  'completed',
] as const
/** Union of the string literals listed in `DocumentIndexingStatusList`. */
export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
/**
 * Document display status values, as a readonly tuple.
 * `as const` keeps each entry as a string literal type so a union can be
 * derived from it.
 */
export const DisplayStatusList = [
  'queuing',
  'indexing',
  'paused',
  'error',
  'available',
  'enabled',
  'disabled',
  'archived',
] as const
/** Union of the string literals listed in `DisplayStatusList`. */
export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
/**
 * Source details attached to a document.
 *
 * NOTE(review): `upload_file`, `job_id` and `url` are all required here even
 * though they look specific to different source types — confirm whether
 * some should be optional.
 */
export type DataSourceInfo = {
  upload_file: {
    id: string
    name: string
    size: number
    mime_type: string
    created_at: number
    created_by: string
    extension: string
  }
  notion_page_icon?: string
  job_id: string
  url: string
}
/** Base set of document fields; extended by `SimpleDocumentDetail`. */
export type InitialDocumentDetail = {
  id: string
  batch: string
  position: number
  dataset_id: string
  data_source_type: DataSourceType
  data_source_info: DataSourceInfo
  dataset_process_rule_id: string
  name: string
  /** Restricted to these two literal values. */
  created_from: 'api' | 'web'
  created_by: string
  created_at: number
  indexing_status: DocumentIndexingStatus
  display_status: DocumentDisplayStatus
  completed_segments?: number
  total_segments?: number
  /** Same literal values as the `DocForm` enum members in this file. */
  doc_form: 'text_model' | 'qa_model'
  doc_language: string
}
/**
 * `InitialDocumentDetail` plus status, counters and optional source details.
 *
 * `dataset_process_rule_id` is redeclared as optional here, but the
 * intersection with `InitialDocumentDetail` (where it is required) keeps it
 * required on the resulting type.
 */
export type SimpleDocumentDetail = InitialDocumentDetail & {
  enabled: boolean
  word_count: number
  error?: string | null
  archived: boolean
  updated_at: number
  hit_count: number
  dataset_process_rule_id?: string
  data_source_detail_dict?: {
    upload_file: {
      name: string
      extension: string
    }
  }
}
/** A page of `SimpleDocumentDetail` entries together with paging fields. */
export type DocumentListResponse = {
  data: Array<SimpleDocumentDetail>
  has_more: boolean
  total: number
  page: number
  limit: number
}
/** Fields shared by `CreateDocumentReq` and `IndexingEstimateParams`. */
export type DocumentReq = {
  original_document_id?: string
  // NOTE(review): typed as a plain string here, while `DataSet.indexing_technique`
  // is the union 'high_quality' | 'economy' — confirm whether they should match.
  indexing_technique?: string
  doc_form: 'text_model' | 'qa_model'
  doc_language: string
  process_rule: ProcessRule
}
/** `DocumentReq` plus the data source, retrieval config and embedding model fields. */
export type CreateDocumentReq = DocumentReq & {
  data_source: DataSource
  retrieval_model: RetrievalConfig
  embedding_model: string
  embedding_model_provider: string
}
/**
 * `DocumentReq` combined with an all-optional `DataSource` and a required
 * `dataset_id`.
 */
export type IndexingEstimateParams =
  & DocumentReq
  & Partial<DataSource>
  & {
    dataset_id: string
  }
/**
 * A data source: its type plus an `info_list` object.
 *
 * `info_list` repeats the source type in `data_source_type` and carries three
 * optional sub-objects (Notion, file, website).
 */
export type DataSource = {
  type: DataSourceType
  info_list: {
    data_source_type: DataSourceType
    notion_info_list?: Array<NotionInfo>
    file_info_list?: {
      file_ids: Array<string>
    }
    website_info_list?: {
      provider: string
      job_id: string
      urls: Array<string>
    }
  }
}
/** A workspace id together with a list of `DataSourceNotionPage` entries. */
export type NotionInfo = {
  workspace_id: string
  pages: Array<DataSourceNotionPage>
}
/** A page reference made of a page id and a type string. */
export type NotionPage = {
  page_id: string
  type: string
}
/**
 * A process rule with a free-form `mode` string.
 * `ProcessRuleResponse` has the same fields but restricts `mode` to `ProcessMode`.
 */
export type ProcessRule = {
  mode: string
  rules: Rules
}
/**
 * A batch id and the documents in it, with an optional dataset.
 *
 * The lower-camel-case name is kept as-is because it is an exported
 * identifier.
 */
export type createDocumentResponse = {
  dataset?: DataSet
  batch: string
  documents: InitialDocumentDetail[]
}
/**
 * `SimpleDocumentDetail` plus processing timestamps, archive/disable fields
 * and document metadata.
 *
 * - `batch` and `indexing_status` are also declared on the base type; the
 *   intersection keeps the narrower base declaration of `indexing_status`.
 * - The `[key: string]: any` index signature means any property not listed
 *   here is typed `any`.
 */
export type FullDocumentDetail = SimpleDocumentDetail & {
  batch: string
  created_api_request_id: string
  processing_started_at: number
  parsing_completed_at: number
  cleaning_completed_at: number
  splitting_completed_at: number
  tokens: number
  indexing_latency: number
  completed_at: number
  paused_by: string
  paused_at: number
  stopped_at: number
  indexing_status: string
  disabled_at: number
  disabled_by: string
  /** Restricted to these two literal values. */
  archived_reason: 'rule_modified' | 're_upload'
  archived_by: string
  archived_at: number
  /** A `DocType`, the literal `'others'`, `null`, or absent. */
  doc_type?: DocType | null | 'others'
  doc_metadata?: DocMetadata | null
  segment_count: number
  [key: string]: any
}
/**
 * String-valued document metadata.
 * Seven keys are required; the index signature allows any further
 * string-valued keys.
 */
export type DocMetadata = {
  title: string
  language: string
  author: string
  publisher: string
  publicationDate: string
  ISBN: string
  category: string
  [key: string]: string
}
/**
 * Document type identifiers in the "customizable" group, as a readonly tuple.
 * `as const` keeps each entry as a string literal type.
 */
export const CUSTOMIZABLE_DOC_TYPES = [
  'book',
  'web_page',
  'paper',
  'social_media_post',
  'personal_document',
  'business_document',
  'im_chat_log',
] as const
/**
 * Document type identifiers in the "fixed" group, as a readonly tuple.
 * `as const` keeps each entry as a string literal type.
 */
export const FIXED_DOC_TYPES = [
  'synced_from_github',
  'synced_from_notion',
  'wikipedia_entry',
] as const
/** Union of the string literals in `CUSTOMIZABLE_DOC_TYPES`. */
export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]
/** Union of the string literals in `FIXED_DOC_TYPES`. */
export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]
/** Any document type from either group. */
export type DocType =
  | CustomizableDocType
  | FixedDocType
/** Alias of `FullDocumentDetail`; adds no fields of its own. */
export type DocumentDetailResponse = FullDocumentDetail
/**
 * Segment status values, as a readonly tuple.
 *
 * `as const` is required for the derived `SegmentStatus` type: without it
 * the array is inferred as `string[]` and `SegmentStatus` becomes plain
 * `string`. This matches how the other status lists in this file are
 * declared.
 */
export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const
/** Union of the string literals listed in `SEGMENT_STATUS_LIST`. */
export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
/** Query fields for listing segments; only `limit` is required. */
export type SegmentsQuery = {
  last_id?: string
  limit: number
  // status?: SegmentStatus
  hit_count_gte?: number
  keyword?: string
  enabled?: boolean
}
/** Full set of fields describing one segment of a document. */
export type SegmentDetailModel = {
  id: string
  position: number
  document_id: string
  content: string
  word_count: number
  tokens: number
  keywords: string[]
  index_node_id: string
  index_node_hash: string
  hit_count: number
  enabled: boolean
  disabled_at: number
  disabled_by: string
  status: SegmentStatus
  created_by: string
  created_at: number
  indexing_at: number
  completed_at: number
  /** Either an error string or `null`. */
  error: string | null
  stopped_at: number
  // NOTE(review): presumably only set for Q&A-form documents — confirm.
  answer?: string
}
/** A list of `SegmentDetailModel` entries together with paging fields. */
export type SegmentsResponse = {
  data: Array<SegmentDetailModel>
  has_more: boolean
  limit: number
  total: number
}
/** One hit-testing record, including where it came from and who created it. */
export type HitTestingRecord = {
  id: string
  content: string
  /** Restricted to these three literal values. */
  source:
    | 'app'
    | 'hit_testing'
    | 'plugin'
  source_app_id: string
  /** Restricted to these two literal values. */
  created_by_role:
    | 'account'
    | 'end_user'
  created_by: string
  created_at: number
}
/** A matched segment with its score and a 2-D position. */
export type HitTesting = {
  segment: Segment
  score: number
  tsne_position: TsnePosition
}
/** Segment fields used in hit-testing results, including the owning document. */
export type Segment = {
  id: string
  document: Document
  content: string
  position: number
  word_count: number
  tokens: number
  keywords: Array<string>
  hit_count: number
  index_node_hash: string
}
/**
 * Minimal document summary.
 *
 * This module-level `Document` shadows the DOM global of the same name
 * inside this file and in any module that imports it under this name.
 */
export type Document = {
  id: string
  data_source_type: string
  name: string
  doc_type: DocType
}
/** A page of `HitTestingRecord` entries together with paging fields. */
export type HitTestingRecordsResponse = {
  data: Array<HitTestingRecord>
  has_more: boolean
  limit: number
  total: number
  page: number
}
/** A 2-D coordinate pair. */
export type TsnePosition = {
  x: number
  y: number
}
/** The submitted query (content and position) plus the matching records. */
export type HitTestingResponse = {
  query: {
    content: string
    tsne_position: TsnePosition
  }
  records: HitTesting[]
}
/** Summary of an app, including its mode and icon fields. */
export type RelatedApp = {
  id: string
  name: string
  mode: AppMode
  /** May be `null`. */
  icon_type: AppIconType | null
  icon: string
  icon_background: string
  icon_url: string
}
/** A list of `RelatedApp` entries and a total count. */
export type RelatedAppResponse = {
  data: RelatedApp[]
  total: number
}
/** Fields for updating a segment; only `content` is required. */
export type SegmentUpdater = {
  content: string
  answer?: string
  keywords?: string[]
}
/** String identifiers for the two document forms. */
export enum DocForm {
  /** Serialized as `'text_model'`. */
  TEXT = 'text_model',
  /** Serialized as `'qa_model'`. */
  QA = 'qa_model',
}
/** A list of `IndexingStatusResponse` entries and a total count. */
export type ErrorDocsResponse = {
  data: Array<IndexingStatusResponse>
  total: number
}
/** Boolean flags summarising a selection of datasets. */
export type SelectedDatasetsMode = {
  allHighQuality: boolean
  allHighQualityVectorSearch: boolean
  allHighQualityFullTextSearch: boolean
  allEconomic: boolean
  mixtureHighQualityAndEconomic: boolean
  inconsistentEmbeddingModel: boolean
}
/** String identifiers for the weighted-score presets. */
export enum WeightedScoreEnum {
  /** Serialized as `'semantic_first'`. */
  SemanticFirst = 'semantic_first',
  /** Serialized as `'keyword_first'`. */
  KeywordFirst = 'keyword_first',
  /** Serialized as `'customized'`. */
  Customized = 'customized',
}
/** String identifiers for the two reranking modes. */
export enum RerankingModeEnum {
  /** Serialized as `'reranking_model'`. */
  RerankingModel = 'reranking_model',
  /** Serialized as `'weighted_score'`. */
  WeightedScore = 'weighted_score',
}
/**
 * Default semantic/keyword weight pairs, keyed by scenario.
 *
 * Every pair sums to 1. `other` carries the same values as `semanticFirst`
 * but is a separate object.
 */
export const DEFAULT_WEIGHTED_SCORE = {
  // Semantic weight only.
  allHighQualityVectorSearch: { semantic: 1, keyword: 0 },
  // Keyword weight only.
  allHighQualityFullTextSearch: { semantic: 0, keyword: 1 },
  semanticFirst: { semantic: 0.7, keyword: 0.3 },
  keywordFirst: { semantic: 0.3, keyword: 0.7 },
  other: { semantic: 0.7, keyword: 0.3 },
}