Feat/add unstructured support (#1780)

Co-authored-by: jyong <jyong@dify.ai>
2024-11-16 03:32:23 +08:00 · 2023-12-18 23:24:06 +08:00 · 2023-12-18 23:24:06 +08:00 · 5e34f938c1
commit 5e34f938c1
parent 2fd56cb01c
15 changed files with 361 additions and 14 deletions
--- a/api/.env.example
+++ b/api/.env.example
@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE=
 HOSTED_ANTHROPIC_API_KEY=
 HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
 HOSTED_ANTHROPIC_PAID_ENABLED=false
+
+ETL_TYPE=dify
+UNSTRUCTURED_API_URL=
--- a/api/config.py
+++ b/api/config.py
@ -54,7 +54,8 @@ DEFAULTS = {
    'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10,
    'OUTPUT_MODERATION_BUFFER_SIZE': 300,
    'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64',
-    'INVITE_EXPIRY_HOURS': 72
+    'INVITE_EXPIRY_HOURS': 72,
+    'ETL_TYPE': 'dify',
 }


@ -276,6 +277,9 @@ class Config:
        self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED')
        self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS')

+        self.ETL_TYPE = get_env('ETL_TYPE')
+        self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')
+

 class CloudEditionConfig(Config):

--- a/api/controllers/console/datasets/file.py
+++ b/api/controllers/console/datasets/file.py
@ -69,5 +69,20 @@ class FilePreviewApi(Resource):
        return {'content': text}


+class FileeSupportTypApi(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self):
+        etl_type = current_app.config['ETL_TYPE']
+        if etl_type == 'Unstructured':
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+        else:
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
+        return {'allowed_extensions': allowed_extensions}
+
+
 api.add_resource(FileApi, '/files/upload')
 api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
+api.add_resource(FileeSupportTypApi, '/files/support-type')
--- a/api/core/data_loader/file_extractor.py
+++ b/api/core/data_loader/file_extractor.py
@ -3,7 +3,8 @@ from pathlib import Path
 from typing import List, Union, Optional

 import requests
-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
+from flask import current_app
+from langchain.document_loaders import TextLoader, Docx2txtLoader
 from langchain.schema import Document

 from core.data_loader.loader.csv_loader import CSVLoader
@ -11,6 +12,13 @@ from core.data_loader.loader.excel import ExcelLoader
 from core.data_loader.loader.html import HTMLLoader
 from core.data_loader.loader.markdown import MarkdownLoader
 from core.data_loader.loader.pdf import PdfLoader
+from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
+from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
+from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
+from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
+from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
+from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
+from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
 from extensions.ext_storage import storage
 from models.model import UploadFile

@ -49,14 +57,34 @@ class FileExtractor:
        input_file = Path(file_path)
        delimiter = '\n'
        file_extension = input_file.suffix.lower()
-        if is_automatic:
-            loader = UnstructuredFileLoader(
-                file_path, strategy="hi_res", mode="elements"
-            )
-            # loader = UnstructuredAPIFileLoader(
-            #     file_path=filenames[0],
-            #     api_key="FAKE_API_KEY",
-            # )
+        etl_type = current_app.config['ETL_TYPE']
+        unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
+        if etl_type == 'Unstructured':
+            if file_extension == '.xlsx':
+                loader = ExcelLoader(file_path)
+            elif file_extension == '.pdf':
+                loader = PdfLoader(file_path, upload_file=upload_file)
+            elif file_extension in ['.md', '.markdown']:
+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
+            elif file_extension in ['.htm', '.html']:
+                loader = HTMLLoader(file_path)
+            elif file_extension == '.docx':
+                loader = Docx2txtLoader(file_path)
+            elif file_extension == '.csv':
+                loader = CSVLoader(file_path, autodetect_encoding=True)
+            elif file_extension == '.msg':
+                loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
+            elif file_extension == '.eml':
+                loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
+            elif file_extension == '.ppt':
+                loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
+            elif file_extension == '.pptx':
+                loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
+            elif file_extension == '.xml':
+                loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
+            else:
+                # txt
+                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
        else:
            if file_extension == '.xlsx':
                loader = ExcelLoader(file_path)
--- a/api/core/data_loader/loader/unstructured/unstructured_eml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_eml.py
@ -0,0 +1,41 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredEmailLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str,
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.email import partition_email
+
+        elements = partition_email(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
--- a/api/core/data_loader/loader/unstructured/unstructured_markdown.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_markdown.py
@ -0,0 +1,48 @@
+import logging
+from typing import List
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredMarkdownLoader(BaseLoader):
+    """Load md files.
+
+
+    Args:
+        file_path: Path to the file to load.
+
+        remove_hyperlinks: Whether to remove hyperlinks from the text.
+
+        remove_images: Whether to remove images from the text.
+
+        encoding: File encoding to use. If `None`, the file will be loaded
+        with the default system encoding.
+
+        autodetect_encoding: Whether to try to autodetect the file encoding
+            if the specified encoding fails.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str,
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.md import partition_md
+
+        elements = partition_md(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
--- a/api/core/data_loader/loader/unstructured/unstructured_msg.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_msg.py
@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredMsgLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.msg import partition_msg
+
+        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
--- a/api/core/data_loader/loader/unstructured/unstructured_ppt.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_ppt.py
@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredPPTLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.ppt import partition_ppt
+
+        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
--- a/api/core/data_loader/loader/unstructured/unstructured_pptx.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_pptx.py
@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredPPTXLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.pptx import partition_pptx
+
+        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
--- a/api/core/data_loader/loader/unstructured/unstructured_text.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_text.py
@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredTextLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.text import partition_text
+
+        elements = partition_text(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
--- a/api/core/data_loader/loader/unstructured/unstructured_xml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_xml.py
@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredXmlLoader(BaseLoader):
+    """Load msg files.
+
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.xml import partition_xml
+
+        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@ -397,7 +397,7 @@ class IndexingRunner:
                one_or_none()

            if file_detail:
-                text_docs = FileExtractor.load(file_detail, is_automatic=False)
+                text_docs = FileExtractor.load(file_detail, is_automatic=True)
        elif dataset_document.data_source_type == 'notion_import':
            loader = NotionLoader.from_document(dataset_document)
            text_docs = loader.load()
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model):
        ],
        'segmentation': {
            'delimiter': '\n',
-            'max_tokens': 512
+            'max_tokens': 1000
        }
    }

--- a/api/requirements.txt
+++ b/api/requirements.txt
@ -53,4 +53,6 @@ zhipuai==1.0.7
 werkzeug==2.3.7
 pymilvus==2.3.0
 qdrant-client==1.6.4
-cohere~=4.32
+cohere~=4.32
+unstructured~=0.10.27
+unstructured[docx,pptx]~=0.10.27
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@ -27,7 +27,13 @@ class FileService:
    @staticmethod
    def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile:
        extension = file.filename.split('.')[-1]
-        if extension.lower() not in ALLOWED_EXTENSIONS:
+        etl_type = current_app.config['ETL_TYPE']
+        if etl_type == 'Unstructured':
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+        else:
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
+        if extension.lower() not in allowed_extensions:
            raise UnsupportedFileTypeError()
        elif only_image and extension.lower() not in IMAGE_EXTENSIONS:
            raise UnsupportedFileTypeError()