Feat: chunk overlap supported (#2209)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
KVOJJJin 2024-01-26 13:24:40 +08:00 committed by GitHub
parent 3322710dac
commit 89fcf4ea7c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 53 additions and 8 deletions

View File

@ -562,7 +562,7 @@ class IndexingRunner:
character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
chunk_size=segmentation["max_tokens"],
chunk_overlap=0,
chunk_overlap=segmentation.get('chunk_overlap', 0),
fixed_separator=separator,
separators=["\n\n", "", ".", " ", ""],
embedding_model_instance=embedding_model_instance
@ -571,7 +571,7 @@ class IndexingRunner:
# Automatic segmentation
character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
chunk_overlap=0,
chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
separators=["\n\n", "", ".", " ", ""],
embedding_model_instance=embedding_model_instance
)

View File

@ -134,7 +134,8 @@ class DatasetProcessRule(db.Model):
],
'segmentation': {
'delimiter': '\n',
'max_tokens': 1000
'max_tokens': 500,
'chunk_overlap': 50
}
}

View File

@ -241,7 +241,8 @@ class DocumentService:
],
'segmentation': {
'delimiter': '\n',
'max_tokens': 500
'max_tokens': 500,
'chunk_overlap': 50
}
}
}

View File

@ -18,7 +18,7 @@
}
.form .label {
@apply pt-6 pb-2;
@apply pt-6 pb-2 flex items-center;
font-weight: 500;
font-size: 16px;
line-height: 24px;

View File

@ -33,13 +33,14 @@ import { DataSourceType, DocForm } from '@/models/datasets'
import NotionIcon from '@/app/components/base/notion-icon'
import Switch from '@/app/components/base/switch'
import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
import { XClose } from '@/app/components/base/icons/src/vender/line/general'
import { HelpCircle, XClose } from '@/app/components/base/icons/src/vender/line/general'
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { IS_CE_EDITION } from '@/config'
import { RETRIEVE_METHOD } from '@/types/app'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import Tooltip from '@/app/components/base/tooltip'
import TooltipPlus from '@/app/components/base/tooltip-plus'
import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
import { LanguagesSupportedUnderscore, getModelRuntimeSupported } from '@/utils/language'
@ -99,7 +100,8 @@ const StepTwo = ({
const [previewScrolled, setPreviewScrolled] = useState(false)
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
const [max, setMax] = useState(1000)
const [max, setMax] = useState(500)
const [overlap, setOverlap] = useState(50)
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()
const hasSetIndexType = !!indexingType
@ -171,6 +173,7 @@ const StepTwo = ({
if (defaultConfig) {
setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
setMax(defaultConfig.segmentation.max_tokens)
setOverlap(defaultConfig.segmentation.chunk_overlap)
setRules(defaultConfig.pre_processing_rules)
}
}
@ -207,6 +210,7 @@ const StepTwo = ({
segmentation: {
separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
max_tokens: max,
chunk_overlap: overlap,
},
}
processRule.rules = ruleObj
@ -275,6 +279,10 @@ const StepTwo = ({
} = useModelListAndDefaultModelAndCurrentProviderAndModel(3)
const getCreationParams = () => {
let params
if (segmentationType === SegmentType.CUSTOM && overlap > max) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
return
}
if (isSetting) {
params = {
original_document_id: documentDetail?.id,
@ -337,6 +345,7 @@ const StepTwo = ({
const separator = res.rules.segmentation.separator
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
setMax(res.rules.segmentation.max_tokens)
setOverlap(res.rules.segmentation.chunk_overlap)
setRules(res.rules.pre_processing_rules)
setDefaultConfig(res.rules)
}
@ -350,8 +359,10 @@ const StepTwo = ({
const rules = documentDetail.dataset_process_rule.rules
const separator = rules.segmentation.separator
const max = rules.segmentation.max_tokens
const overlap = rules.segmentation.chunk_overlap
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
setMax(max)
setOverlap(overlap)
setRules(rules.pre_processing_rules)
setDefaultConfig(rules)
}
@ -569,13 +580,35 @@ const StepTwo = ({
<input
type="number"
className={s.input}
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
value={max}
min={1}
onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
/>
</div>
</div>
<div className={s.formRow}>
<div className='w-full'>
<div className={s.label}>
{t('datasetCreation.stepTwo.overlap')}
<TooltipPlus popupContent={
<div className='max-w-[200px]'>
{t('datasetCreation.stepTwo.overlapTip')}
</div>
}>
<HelpCircle className='ml-1 w-3.5 h-3.5 text-gray-400' />
</TooltipPlus>
</div>
<input
type="number"
className={s.input}
placeholder={t('datasetCreation.stepTwo.overlap') || ''}
value={overlap}
min={1}
onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
/>
</div>
</div>
<div className={s.formRow}>
<div className='w-full flex flex-col gap-1'>
<div className={s.label}>{t('datasetCreation.stepTwo.rules')}</div>

View File

@ -59,6 +59,9 @@ const translation = {
separator: 'Segment identifier',
separatorPlaceholder: 'For example, newline (\\\\n) or special separator (such as "***")',
maxLength: 'Maximum chunk length',
overlap: 'Chunk overlap',
overlapTip: 'Setting the chunk overlap can maintain the semantic relevance between them, enhancing the retrieve effect. It is recommended to set 10%-25% of the maximum chunk size.',
overlapCheck: 'chunk overlap should not bigger than maximun chunk length',
rules: 'Text preprocessing rules',
removeExtraSpaces: 'Replace consecutive spaces, newlines and tabs',
removeUrlEmails: 'Delete all URLs and email addresses',

View File

@ -59,6 +59,9 @@ const translation = {
separator: 'Identificador de segmento',
separatorPlaceholder: 'Por exemplo, nova linha (\\\\n) ou separador especial (como "***")',
maxLength: 'Comprimento máximo do fragmento',
overlap: 'Sobreposição de blocos',
overlapTip: 'Configurar a sobreposição de blocos pode manter a relevância semântica entre eles, melhorando o efeito de recuperação. É recomendado definir de 10% a 25% do tamanho máximo do bloco.',
overlapCheck: 'a sobreposição de blocos não deve ser maior que o comprimento máximo do bloco',
rules: 'Regras de pré-processamento de texto',
removeExtraSpaces: 'Substituir espaços consecutivos, quebras de linha e tabulações',
removeUrlEmails: 'Excluir todos os URLs e endereços de e-mail',

View File

@ -59,6 +59,9 @@ const translation = {
separator: '分段标识符',
separatorPlaceholder: '例如换行符(\n或特定的分隔符如 "***"',
maxLength: '分段最大长度',
overlap: '分段重叠长度',
overlapTip: '设置分段之间的重叠长度可以保留分段之间的语义关系提升召回效果。建议设置为最大分段长度的10%-25%',
overlapCheck: '分段重叠长度不能大于分段最大长度',
rules: '文本预处理规则',
removeExtraSpaces: '替换掉连续的空格、换行符和制表符',
removeUrlEmails: '删除所有 URL 和电子邮件地址',

View File

@ -108,6 +108,7 @@ export type PreProcessingRule = {
export type Segmentation = {
separator: string
max_tokens: number
chunk_overlap: number
}
export const DocumentIndexingStatusList = [