mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 11:42:29 +08:00
feat: Add hyperlink parsing to the DOCX document. (#7017)
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
This commit is contained in:
parent
ffa992acf7
commit
72c75b75cf
|
@ -1,9 +1,12 @@
|
|||
"""Abstract interface for document loader implementations."""
|
||||
import datetime
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import uuid
|
||||
import xml.etree.ElementTree as ET
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
@ -16,6 +19,7 @@ from extensions.ext_database import db
|
|||
from extensions.ext_storage import storage
|
||||
from models.model import UploadFile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class WordExtractor(BaseExtractor):
|
||||
"""Load docx files.
|
||||
|
@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
|
|||
|
||||
image_map = self._extract_images_from_docx(doc, image_folder)
|
||||
|
||||
hyperlinks_url = None
|
||||
url_pattern = re.compile(r'http://[^\s+]+//|https://[^\s+]+')
|
||||
for para in doc.paragraphs:
|
||||
for run in para.runs:
|
||||
if run.text and hyperlinks_url:
|
||||
result = f' [{run.text}]({hyperlinks_url}) '
|
||||
run.text = result
|
||||
hyperlinks_url = None
|
||||
if 'HYPERLINK' in run.element.xml:
|
||||
try:
|
||||
xml = ET.XML(run.element.xml)
|
||||
x_child = [c for c in xml.iter() if c is not None]
|
||||
for x in x_child:
|
||||
if x_child is None:
|
||||
continue
|
||||
if x.tag.endswith('instrText'):
|
||||
for i in url_pattern.findall(x.text):
|
||||
hyperlinks_url = str(i)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
|
||||
|
||||
|
||||
|
||||
def parse_paragraph(paragraph):
|
||||
paragraph_content = []
|
||||
for run in paragraph.runs:
|
||||
|
|
Loading…
Reference in New Issue
Block a user