Use python-docx to extract docx files (#2654)

This commit is contained in:
Bowen Liang 2024-03-07 18:24:55 +08:00 committed by GitHub
parent c0b82f8e58
commit b163545771
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 11 additions and 9 deletions

View File

@@ -10,7 +10,7 @@ from core.rag.models.document import Document
class WordExtractor(BaseExtractor):
"""Load pdf files.
"""Load docx files.
Args:
@@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor):
def extract(self) -> list[Document]:
"""Load given path as single page."""
import docx2txt
from docx import Document as docx_Document
return [
Document(
page_content=docx2txt.process(self.file_path),
metadata={"source": self.file_path},
)
]
document = docx_Document(self.file_path)
doc_texts = [paragraph.text for paragraph in document.paragraphs]
content = '\n'.join(doc_texts)
return [Document(
page_content=content,
metadata={"source": self.file_path},
)]
@staticmethod
def _is_valid_url(url: str) -> bool:

View File

@@ -32,7 +32,7 @@ celery==5.2.7
redis~=4.5.4
openpyxl==3.1.2
chardet~=5.1.0
docx2txt==0.8
python-docx~=1.1.0
pypdfium2==4.16.0
resend~=0.7.0
pyjwt~=2.8.0