Mirror of https://github.com/langgenius/dify.git
Commit df1509983c (parent 185c2f86cd)
@@ -30,11 +30,18 @@ class UnstructuredPPTLoader(BaseLoader):
         from unstructured.partition.ppt import partition_ppt
 
         elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
-        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
-        documents = []
-        for chunk in chunks:
-            text = chunk.text.strip()
-            documents.append(Document(page_content=text))
-
+        text_by_page = {}
+        for element in elements:
+            page = element.metadata.page_number
+            text = element.text
+            if page in text_by_page:
+                text_by_page[page] += "\n" + text
+            else:
+                text_by_page[page] = text
+
+        combined_texts = list(text_by_page.values())
+        documents = []
+        for combined_text in combined_texts:
+            text = combined_text.strip()
+            documents.append(Document(page_content=text))
         return documents
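The hunk above replaces title-based chunking with per-slide grouping: each element's text is appended under its slide's page_number, and one Document is emitted per slide. A minimal, self-contained sketch of that grouping step follows; _Meta, _FakeElement, and group_text_by_page are hypothetical stand-ins for the element objects that unstructured's partition_ppt actually returns, not part of this commit.

from dataclasses import dataclass

@dataclass
class _Meta:
    page_number: int

@dataclass
class _FakeElement:
    metadata: _Meta
    text: str

def group_text_by_page(elements):
    """Combine element texts per slide, mirroring the loop added in this commit."""
    text_by_page = {}
    for element in elements:
        page = element.metadata.page_number
        if page in text_by_page:
            text_by_page[page] += "\n" + element.text
        else:
            text_by_page[page] = element.text
    # One combined string per slide, in first-seen order (dicts preserve insertion order).
    return list(text_by_page.values())

elements = [
    _FakeElement(_Meta(1), "Title slide"),
    _FakeElement(_Meta(1), "Speaker notes"),
    _FakeElement(_Meta(2), "Agenda"),
]
print(group_text_by_page(elements))  # ['Title slide\nSpeaker notes', 'Agenda']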
@@ -30,11 +30,19 @@ class UnstructuredPPTXLoader(BaseLoader):
         from unstructured.partition.pptx import partition_pptx
 
         elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
-        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        text_by_page = {}
+        for element in elements:
+            page = element.metadata.page_number
+            text = element.text
+            if page in text_by_page:
+                text_by_page[page] += "\n" + text
+            else:
+                text_by_page[page] = text
+
+        combined_texts = list(text_by_page.values())
         documents = []
-        for chunk in chunks:
-            text = chunk.text.strip()
+        for combined_text in combined_texts:
+            text = combined_text.strip()
             documents.append(Document(page_content=text))
 
         return documents
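The PPTX loader receives the identical rewrite. A hedged usage sketch, assuming the conventional BaseLoader-style load() entry point and that the constructor accepts the file path and optional Unstructured API URL implied by self._file_path and self._api_url (the argument names below are assumptions, not shown in this diff):

# Hypothetical invocation; constructor arguments are assumed, not confirmed by this diff.
loader = UnstructuredPPTXLoader("deck.pptx", api_url=None)  # api_url=None -> partition locally
documents = loader.load()
for doc in documents:
    # After this change each Document carries the combined text of one slide
    # rather than one title-based chunk.
    print(len(doc.page_content), "characters for this slide")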
@@ -529,6 +529,13 @@ class IndexingRunner:
                     hash = helper.generate_text_hash(document_node.page_content)
                     document_node.metadata['doc_id'] = doc_id
                     document_node.metadata['doc_hash'] = hash
+                    # delete Spliter character
+                    page_content = document_node.page_content
+                    if page_content.startswith(".") or page_content.startswith("。"):
+                        page_content = page_content[1:]
+                    else:
+                        page_content = page_content
+                    document_node.page_content = page_content
                     split_documents.append(document_node)
             all_documents.extend(split_documents)
         # processing qa document
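The IndexingRunner hunk strips a single leading "." or "。" that the text splitter's separator can leave at the start of a split segment (the added else branch is a no-op that only restates the unchanged value). A standalone sketch of the same check; strip_leading_separator is an illustrative helper name, not a function in the codebase:

def strip_leading_separator(page_content: str) -> str:
    # Drop one leading "." or "。" left behind by the splitter separator,
    # mirroring the check added to IndexingRunner in this commit.
    if page_content.startswith(".") or page_content.startswith("。"):
        return page_content[1:]
    return page_content

print(strip_leading_separator("。First split segment"))  # -> "First split segment"
print(strip_leading_separator("Normal sentence"))        # unchanged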