Mirror of https://github.com/langgenius/dify.git (synced 2024-11-16 11:42:29 +08:00)
Use python-docx to extract docx files (#2654)
This commit is contained in:
parent
c0b82f8e58
commit
b163545771
|
@ -10,7 +10,7 @@ from core.rag.models.document import Document
|
|||
|
||||
|
||||
class WordExtractor(BaseExtractor):
|
||||
"""Load pdf files.
|
||||
"""Load docx files.
|
||||
|
||||
|
||||
Args:
|
||||
|
@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor):
|
|||
|
||||
def extract(self) -> list[Document]:
|
||||
"""Load given path as single page."""
|
||||
import docx2txt
|
||||
from docx import Document as docx_Document
|
||||
|
||||
return [
|
||||
Document(
|
||||
page_content=docx2txt.process(self.file_path),
|
||||
metadata={"source": self.file_path},
|
||||
)
|
||||
]
|
||||
document = docx_Document(self.file_path)
|
||||
doc_texts = [paragraph.text for paragraph in document.paragraphs]
|
||||
content = '\n'.join(doc_texts)
|
||||
|
||||
return [Document(
|
||||
page_content=content,
|
||||
metadata={"source": self.file_path},
|
||||
)]
|
||||
|
||||
@staticmethod
|
||||
def _is_valid_url(url: str) -> bool:
|
||||
|
|
|
@ -32,7 +32,7 @@ celery==5.2.7
|
|||
redis~=4.5.4
|
||||
openpyxl==3.1.2
|
||||
chardet~=5.1.0
|
||||
docx2txt==0.8
|
||||
python-docx~=1.1.0
|
||||
pypdfium2==4.16.0
|
||||
resend~=0.7.0
|
||||
pyjwt~=2.8.0
|
||||
|
|
Loading…
Reference in New Issue
Block a user