Mirror of https://github.com/langgenius/dify.git (synced 2024-11-16 03:32:23 +08:00)
Use python-docx to extract docx files (#2654)
This commit is contained in:
parent c0b82f8e58
commit b163545771
|
@ -10,7 +10,7 @@ from core.rag.models.document import Document
|
||||||
|
|
||||||
|
|
||||||
class WordExtractor(BaseExtractor):
|
class WordExtractor(BaseExtractor):
|
||||||
"""Load pdf files.
|
"""Load docx files.
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor):
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
"""Load given path as single page."""
|
"""Load given path as single page."""
|
||||||
import docx2txt
|
from docx import Document as docx_Document
|
||||||
|
|
||||||
return [
|
document = docx_Document(self.file_path)
|
||||||
Document(
|
doc_texts = [paragraph.text for paragraph in document.paragraphs]
|
||||||
page_content=docx2txt.process(self.file_path),
|
content = '\n'.join(doc_texts)
|
||||||
metadata={"source": self.file_path},
|
|
||||||
)
|
return [Document(
|
||||||
]
|
page_content=content,
|
||||||
|
metadata={"source": self.file_path},
|
||||||
|
)]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _is_valid_url(url: str) -> bool:
|
def _is_valid_url(url: str) -> bool:
|
||||||
|
|
|
@ -32,7 +32,7 @@ celery==5.2.7
|
||||||
redis~=4.5.4
|
redis~=4.5.4
|
||||||
openpyxl==3.1.2
|
openpyxl==3.1.2
|
||||||
chardet~=5.1.0
|
chardet~=5.1.0
|
||||||
docx2txt==0.8
|
python-docx~=1.1.0
|
||||||
pypdfium2==4.16.0
|
pypdfium2==4.16.0
|
||||||
resend~=0.7.0
|
resend~=0.7.0
|
||||||
pyjwt~=2.8.0
|
pyjwt~=2.8.0
|
||||||
|
|
Loading…
Reference in New Issue
Block a user