feat: Add hyperlink parsing to the DOCX document. (#7017)
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions

This commit is contained in:
chenxu9741 2024-08-07 16:01:14 +08:00 committed by GitHub
parent ffa992acf7
commit 72c75b75cf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,9 +1,12 @@
"""Abstract interface for document loader implementations."""
import datetime
import logging
import mimetypes
import os
import re
import tempfile
import uuid
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
import requests
@ -16,6 +19,7 @@ from extensions.ext_database import db
from extensions.ext_storage import storage
from models.model import UploadFile
logger = logging.getLogger(__name__)
class WordExtractor(BaseExtractor):
"""Load docx files.
@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
image_map = self._extract_images_from_docx(doc, image_folder)

# Word stores a hyperlink as a field split across several runs: one run's
# XML carries a w:instrText element with the field code
# (`HYPERLINK "https://..."`), and a later run carries the display text.
# We capture the URL from the field code, then rewrite the next text run
# as a markdown link.
hyperlinks_url = None
# NOTE(fix): the previous pattern r'http://[^\s+]+//|https://[^\s+]+' was
# broken — the http branch demanded a trailing '//' (so plain http URLs
# never matched) and [^\s+] wrongly excluded the legal '+' character.
url_pattern = re.compile(r'https?://\S+')
for para in doc.paragraphs:
    for run in para.runs:
        # A URL captured from a previous run's field code applies to
        # this run's display text; consume it exactly once.
        if run.text and hyperlinks_url:
            run.text = f' [{run.text}]({hyperlinks_url}) '
            hyperlinks_url = None
        if 'HYPERLINK' in run.element.xml:
            try:
                xml = ET.XML(run.element.xml)
                for node in xml.iter():
                    # Only w:instrText elements carry the field code;
                    # guard node.text, which may be None.
                    if node.tag.endswith('instrText') and node.text:
                        for found_url in url_pattern.findall(node.text):
                            hyperlinks_url = str(found_url)
            except Exception:
                # Best-effort: a malformed run must not abort extraction.
                logger.exception('Failed to parse HYPERLINK field from run XML')
def parse_paragraph(paragraph):
paragraph_content = []
for run in paragraph.runs: