mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 11:42:29 +08:00
feat: Add hyperlink parsing to the DOCX document. (#7017)
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
This commit is contained in:
parent
ffa992acf7
commit
72c75b75cf
|
@ -1,9 +1,12 @@
|
||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
import datetime
|
import datetime
|
||||||
|
import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import uuid
|
import uuid
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
@ -16,6 +19,7 @@ from extensions.ext_database import db
|
||||||
from extensions.ext_storage import storage
|
from extensions.ext_storage import storage
|
||||||
from models.model import UploadFile
|
from models.model import UploadFile
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class WordExtractor(BaseExtractor):
|
class WordExtractor(BaseExtractor):
|
||||||
"""Load docx files.
|
"""Load docx files.
|
||||||
|
@ -197,6 +201,30 @@ class WordExtractor(BaseExtractor):
|
||||||
|
|
||||||
image_map = self._extract_images_from_docx(doc, image_folder)
|
image_map = self._extract_images_from_docx(doc, image_folder)
|
||||||
|
|
||||||
|
hyperlinks_url = None
|
||||||
|
url_pattern = re.compile(r'http://[^\s+]+//|https://[^\s+]+')
|
||||||
|
for para in doc.paragraphs:
|
||||||
|
for run in para.runs:
|
||||||
|
if run.text and hyperlinks_url:
|
||||||
|
result = f' [{run.text}]({hyperlinks_url}) '
|
||||||
|
run.text = result
|
||||||
|
hyperlinks_url = None
|
||||||
|
if 'HYPERLINK' in run.element.xml:
|
||||||
|
try:
|
||||||
|
xml = ET.XML(run.element.xml)
|
||||||
|
x_child = [c for c in xml.iter() if c is not None]
|
||||||
|
for x in x_child:
|
||||||
|
if x_child is None:
|
||||||
|
continue
|
||||||
|
if x.tag.endswith('instrText'):
|
||||||
|
for i in url_pattern.findall(x.text):
|
||||||
|
hyperlinks_url = str(i)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(e)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_paragraph(paragraph):
|
def parse_paragraph(paragraph):
|
||||||
paragraph_content = []
|
paragraph_content = []
|
||||||
for run in paragraph.runs:
|
for run in paragraph.runs:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user