feat: Add hyperlink parsing to the DOCX document. (#7017)
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions

This commit is contained in:
chenxu9741 2024-08-07 16:01:14 +08:00 committed by GitHub
parent ffa992acf7
commit 72c75b75cf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,9 +1,12 @@
"""Abstract interface for document loader implementations.""" """Abstract interface for document loader implementations."""
import datetime import datetime
import logging
import mimetypes import mimetypes
import os import os
import re
import tempfile import tempfile
import uuid import uuid
import xml.etree.ElementTree as ET
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@ -16,6 +19,7 @@ from extensions.ext_database import db
from extensions.ext_storage import storage from extensions.ext_storage import storage
from models.model import UploadFile from models.model import UploadFile
logger = logging.getLogger(__name__)
class WordExtractor(BaseExtractor): class WordExtractor(BaseExtractor):
"""Load docx files. """Load docx files.
@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
image_map = self._extract_images_from_docx(doc, image_folder) image_map = self._extract_images_from_docx(doc, image_folder)
# Rewrite field-code hyperlinks as Markdown links, in place on the runs.
#
# A run whose XML contains a HYPERLINK field instruction supplies the target
# URL; the next run with non-empty text is treated as the link's display text
# and is rewritten to ` [text](url) `. The pending URL is deliberately kept
# across runs (and paragraphs) until such a text run is found.
# NOTE(review): `doc` is presumably a python-docx Document - confirm against
# the enclosing method, which is not visible here.
hyperlinks_url = None
# Match both http and https targets. The double quote is excluded because
# field instructions normally wrap the target in quotes
# (e.g. ` HYPERLINK "https://example.com" `), and '+' is allowed because it
# is a legal URL character.
url_pattern = re.compile(r'https?://[^\s"]+')
for para in doc.paragraphs:
    for run in para.runs:
        if run.text and hyperlinks_url:
            run.text = f' [{run.text}]({hyperlinks_url}) '
            hyperlinks_url = None

        # Serialize the run once; it is both searched and parsed below.
        run_xml = run.element.xml
        if 'HYPERLINK' not in run_xml:
            continue

        try:
            root = ET.XML(run_xml)
        except ET.ParseError:
            # Best effort: a malformed run must not abort extraction.
            logger.exception('Failed to parse HYPERLINK field XML')
            continue

        for node in root.iter():
            # instrText may be an empty element, in which case text is None.
            if not node.tag.endswith('instrText') or not node.text:
                continue
            found_urls = url_pattern.findall(node.text)
            if found_urls:
                # When several URLs appear, the last one wins.
                hyperlinks_url = found_urls[-1]
def parse_paragraph(paragraph): def parse_paragraph(paragraph):
paragraph_content = [] paragraph_content = []
for run in paragraph.runs: for run in paragraph.runs: