fix: Ignore empty page_content when appending to split_documents (#2898)

This commit is contained in:
listeng 2024-03-19 20:55:15 +08:00 committed by GitHub
parent 4419d357c4
commit 696efe494e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -45,11 +45,12 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
# delete splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith(""):
page_content = page_content[1:]
page_content = page_content[1:].strip()
else:
page_content = page_content
document_node.page_content = page_content
split_documents.append(document_node)
if len(page_content) > 0:
document_node.page_content = page_content
split_documents.append(document_node)
all_documents.extend(split_documents)
return all_documents