From 72c75b75cfe8a4e74a131475e6cdf1283683e455 Mon Sep 17 00:00:00 2001
From: chenxu9741
Date: Wed, 7 Aug 2024 16:01:14 +0800
Subject: [PATCH] feat: Add hyperlink parsing to the DOCX document. (#7017)

---
Review notes (this section is ignored when the patch is applied):
 - match plain http:// URLs as well as https:// ones
 - end the URL at the closing quote of the field code and allow '+' in URLs
 - skip instrText nodes that carry no text instead of relying on a
   blanket `except Exception`
 - log XML parse failures with a traceback

 api/core/rag/extractor/word_extractor.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index de91363f5f..2a77c1afc6 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -1,9 +1,12 @@
 """Abstract interface for document loader implementations."""
 import datetime
+import logging
 import mimetypes
 import os
+import re
 import tempfile
 import uuid
+import xml.etree.ElementTree as ET
 from urllib.parse import urlparse
 
 import requests
@@ -16,6 +19,7 @@ from extensions.ext_database import db
 from extensions.ext_storage import storage
 from models.model import UploadFile
 
+logger = logging.getLogger(__name__)
 
 class WordExtractor(BaseExtractor):
     """Load docx files.
@@ -197,6 +201,28 @@ class WordExtractor(BaseExtractor):
 
         image_map = self._extract_images_from_docx(doc, image_folder)
 
+        # Field-code hyperlinks keep their target in a w:instrText element
+        # (e.g. ` HYPERLINK "https://example.com" `). Remember the URL until
+        # the next run that has visible text, then rewrite that run as Markdown.
+        hyperlinks_url = None
+        url_pattern = re.compile(r'https?://[^\s"]+')
+        for para in doc.paragraphs:
+            for run in para.runs:
+                if run.text and hyperlinks_url:
+                    run.text = f' [{run.text}]({hyperlinks_url}) '
+                    hyperlinks_url = None
+                if 'HYPERLINK' in run.element.xml:
+                    try:
+                        xml = ET.XML(run.element.xml)
+                    except ET.ParseError:
+                        logger.exception('Failed to parse hyperlink field code in DOCX run')
+                        continue
+                    for x in xml.iter():
+                        # An empty instrText element has text None; skip it.
+                        if x.tag.endswith('instrText') and x.text:
+                            for i in url_pattern.findall(x.text):
+                                hyperlinks_url = i
+
         def parse_paragraph(paragraph):
             paragraph_content = []
             for run in paragraph.runs: