Feat/add unstructured support (#1780)

Co-authored-by: jyong <jyong@dify.ai>
2024-11-16 11:42:29 +08:00 · 2023-12-18 23:24:06 +08:00 · 2023-12-18 23:24:06 +08:00 · 5e34f938c1
commit 5e34f938c1
parent 2fd56cb01c
15 changed files with 361 additions and 14 deletions
--- a/api/.env.example
+++ b/api/.env.example
@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE=
 HOSTED_ANTHROPIC_API_KEY=
 HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
 HOSTED_ANTHROPIC_PAID_ENABLED=false
 ETL_TYPE=dify
 UNSTRUCTURED_API_URL=
--- a/api/config.py
+++ b/api/config.py
@ -54,7 +54,8 @@ DEFAULTS = {
    'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10,
    'OUTPUT_MODERATION_BUFFER_SIZE': 300,
    'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64',
-    'INVITE_EXPIRY_HOURS': 72
+    'INVITE_EXPIRY_HOURS': 72,
    'ETL_TYPE': 'dify',
 }
@ -276,6 +277,9 @@ class Config:
        self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED')
        self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS')
        self.ETL_TYPE = get_env('ETL_TYPE')
        self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')
 class CloudEditionConfig(Config):
--- a/api/controllers/console/datasets/file.py
+++ b/api/controllers/console/datasets/file.py
@ -69,5 +69,20 @@ class FilePreviewApi(Resource):
        return {'content': text}
 class FileeSupportTypApi(Resource):
    """Report the file extensions allowed for the configured ETL type."""
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        """Return ``{'allowed_extensions': [...]}`` for the current ``ETL_TYPE``.

        The base list is always returned; when the ``ETL_TYPE`` app config
        value is ``'Unstructured'`` the e-mail, PowerPoint and XML
        extensions are appended to it.
        """
        extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
        if current_app.config['ETL_TYPE'] == 'Unstructured':
            # Extra formats only offered in 'Unstructured' mode.
            extensions.extend(['eml', 'msg', 'pptx', 'ppt', 'xml'])
        return {'allowed_extensions': extensions}
 api.add_resource(FileApi, '/files/upload')
 api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
 api.add_resource(FileeSupportTypApi, '/files/support-type')
--- a/api/core/data_loader/file_extractor.py
+++ b/api/core/data_loader/file_extractor.py
@ -3,7 +3,8 @@ from pathlib import Path
 from typing import List, Union, Optional
 import requests
-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
+from flask import current_app
 from langchain.document_loaders import TextLoader, Docx2txtLoader
 from langchain.schema import Document
 from core.data_loader.loader.csv_loader import CSVLoader
@ -11,6 +12,13 @@ from core.data_loader.loader.excel import ExcelLoader
 from core.data_loader.loader.html import HTMLLoader
 from core.data_loader.loader.markdown import MarkdownLoader
 from core.data_loader.loader.pdf import PdfLoader
 from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
 from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
 from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
 from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
 from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
 from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
 from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
 from extensions.ext_storage import storage
 from models.model import UploadFile
@ -49,14 +57,34 @@ class FileExtractor:
        input_file = Path(file_path)
        delimiter = '\n'
        file_extension = input_file.suffix.lower()
-        if is_automatic:
+        etl_type = current_app.config['ETL_TYPE']
-            loader = UnstructuredFileLoader(
+        unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
-                file_path, strategy="hi_res", mode="elements"
+        if etl_type == 'Unstructured':
-            )
+            if file_extension == '.xlsx':
-            # loader = UnstructuredAPIFileLoader(
+                loader = ExcelLoader(file_path)
-            #     file_path=filenames[0],
+            elif file_extension == '.pdf':
-            #     api_key="FAKE_API_KEY",
+                loader = PdfLoader(file_path, upload_file=upload_file)
-            # )
+            elif file_extension in ['.md', '.markdown']:
                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
            elif file_extension in ['.htm', '.html']:
                loader = HTMLLoader(file_path)
            elif file_extension == '.docx':
                loader = Docx2txtLoader(file_path)
            elif file_extension == '.csv':
                loader = CSVLoader(file_path, autodetect_encoding=True)
            elif file_extension == '.msg':
                loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
            elif file_extension == '.eml':
                loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
            elif file_extension == '.ppt':
                loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
            elif file_extension == '.pptx':
                loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
            elif file_extension == '.xml':
                loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
            else:
                # txt
                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
        else:
            if file_extension == '.xlsx':
                loader = ExcelLoader(file_path)
--- a/api/core/data_loader/loader/unstructured/unstructured_eml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_eml.py
@ -0,0 +1,41 @@
 import logging
 import re
 from typing import Optional, List, Tuple, cast
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
 class UnstructuredEmailLoader(BaseLoader):
    """Load eml files.

    Args:
        file_path: Path to the file to load.
        api_url: Passed to the partition call as its ``api_url`` keyword.
    """
    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for ``load``."""
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the e-mail, chunk it by title and return one Document per chunk."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.email import partition_email
        from unstructured.chunking.title import chunk_by_title
        parts = partition_email(filename=self._file_path, api_url=self._api_url)
        pieces = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
--- a/api/core/data_loader/loader/unstructured/unstructured_markdown.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_markdown.py
@ -0,0 +1,48 @@
 import logging
 from typing import List
 from langchain.document_loaders.base import BaseLoader
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
 class UnstructuredMarkdownLoader(BaseLoader):
    """Load md files.

    Args:
        file_path: Path to the file to load.
        api_url: Passed to the partition call as its ``api_url`` keyword.
    """
    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for ``load``."""
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the markdown file, chunk it by title and return one Document per chunk."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.md import partition_md
        from unstructured.chunking.title import chunk_by_title
        parts = partition_md(filename=self._file_path, api_url=self._api_url)
        pieces = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
--- a/api/core/data_loader/loader/unstructured/unstructured_msg.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_msg.py
@ -0,0 +1,40 @@
 import logging
 import re
 from typing import Optional, List, Tuple, cast
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
 class UnstructuredMsgLoader(BaseLoader):
    """Load msg files.

    Args:
        file_path: Path to the file to load.
        api_url: Passed to the partition call as its ``api_url`` keyword.
    """
    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for ``load``."""
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the msg file, chunk it by title and return one Document per chunk."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.msg import partition_msg
        from unstructured.chunking.title import chunk_by_title
        parts = partition_msg(filename=self._file_path, api_url=self._api_url)
        pieces = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
--- a/api/core/data_loader/loader/unstructured/unstructured_ppt.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_ppt.py
@ -0,0 +1,40 @@
 import logging
 import re
 from typing import Optional, List, Tuple, cast
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
 class UnstructuredPPTLoader(BaseLoader):
    """Load ppt files.

    Args:
        file_path: Path to the file to load.
        api_url: Passed to the partition call as its ``api_url`` keyword.
    """
    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for ``load``."""
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the ppt file, chunk it by title and return one Document per chunk."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.ppt import partition_ppt
        from unstructured.chunking.title import chunk_by_title
        parts = partition_ppt(filename=self._file_path, api_url=self._api_url)
        pieces = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
--- a/api/core/data_loader/loader/unstructured/unstructured_pptx.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_pptx.py
@ -0,0 +1,40 @@
 import logging
 import re
 from typing import Optional, List, Tuple, cast
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
 class UnstructuredPPTXLoader(BaseLoader):
    """Load pptx files.

    Args:
        file_path: Path to the file to load.
        api_url: Passed to the partition call as its ``api_url`` keyword.
    """
    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for ``load``."""
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the pptx file, chunk it by title and return one Document per chunk."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.pptx import partition_pptx
        from unstructured.chunking.title import chunk_by_title
        parts = partition_pptx(filename=self._file_path, api_url=self._api_url)
        pieces = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
--- a/api/core/data_loader/loader/unstructured/unstructured_text.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_text.py
@ -0,0 +1,40 @@
 import logging
 import re
 from typing import Optional, List, Tuple, cast
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
 class UnstructuredTextLoader(BaseLoader):
    """Load text files.

    Args:
        file_path: Path to the file to load.
        api_url: Passed to the partition call as its ``api_url`` keyword.
    """
    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for ``load``."""
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the text file, chunk it by title and return one Document per chunk."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.text import partition_text
        from unstructured.chunking.title import chunk_by_title
        parts = partition_text(filename=self._file_path, api_url=self._api_url)
        pieces = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
--- a/api/core/data_loader/loader/unstructured/unstructured_xml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_xml.py
@ -0,0 +1,40 @@
 import logging
 import re
 from typing import Optional, List, Tuple, cast
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
 class UnstructuredXmlLoader(BaseLoader):
    """Load xml files.

    Args:
        file_path: Path to the file to load.
        api_url: Passed to the partition call as its ``api_url`` keyword.
    """
    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for ``load``."""
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the xml file, chunk it by title and return one Document per chunk."""
        # Imported lazily so the module can be imported without `unstructured`.
        from unstructured.partition.xml import partition_xml
        from unstructured.chunking.title import chunk_by_title
        # The partitioner is called with xml_keep_tags=True.
        parts = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
        pieces = chunk_by_title(parts, max_characters=2000, combine_text_under_n_chars=0)
        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@ -397,7 +397,7 @@ class IndexingRunner:
                one_or_none()
            if file_detail:
-                text_docs = FileExtractor.load(file_detail, is_automatic=False)
+                text_docs = FileExtractor.load(file_detail, is_automatic=True)
        elif dataset_document.data_source_type == 'notion_import':
            loader = NotionLoader.from_document(dataset_document)
            text_docs = loader.load()
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model):
        ],
        'segmentation': {
            'delimiter': '\n',
-            'max_tokens': 512
+            'max_tokens': 1000
        }
    }
--- a/api/requirements.txt
+++ b/api/requirements.txt
@ -53,4 +53,6 @@ zhipuai==1.0.7
 werkzeug==2.3.7
 pymilvus==2.3.0
 qdrant-client==1.6.4
-cohere~=4.32
+cohere~=4.32
 unstructured~=0.10.27
 unstructured[docx,pptx]~=0.10.27
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@ -27,7 +27,13 @@ class FileService:
    @staticmethod
    def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile:
        extension = file.filename.split('.')[-1]
-        if extension.lower() not in ALLOWED_EXTENSIONS:
+        etl_type = current_app.config['ETL_TYPE']
        if etl_type == 'Unstructured':
            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
        else:
            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
        if extension.lower() not in allowed_extensions:
            raise UnsupportedFileTypeError()
        elif only_image and extension.lower() not in IMAGE_EXTENSIONS:
            raise UnsupportedFileTypeError()