From 878d13ef4215e07ee8da346254e00843a707a361 Mon Sep 17 00:00:00 2001 From: powerfool Date: Tue, 29 Oct 2024 21:10:18 +0800 Subject: [PATCH] Added OceanBase as an option for the vector store in Dify (#10010) --- .gitignore | 1 + api/.env.example | 8 + api/commands.py | 1 + api/configs/middleware/__init__.py | 2 + .../middleware/vdb/oceanbase_config.py | 35 +++ api/controllers/console/datasets/datasets.py | 2 + .../rag/datasource/vdb/oceanbase/__init__.py | 0 .../vdb/oceanbase/oceanbase_vector.py | 209 ++++++++++++++++++ api/core/rag/datasource/vdb/vector_factory.py | 4 + api/core/rag/datasource/vdb/vector_type.py | 1 + api/poetry.lock | 67 +++++- api/pyproject.toml | 1 + .../vdb/oceanbase/__init__.py | 0 .../vdb/oceanbase/test_oceanbase.py | 71 ++++++ dev/pytest/pytest_vdb.sh | 1 + docker/.env.example | 8 + docker/docker-compose.yaml | 18 ++ 17 files changed, 427 insertions(+), 2 deletions(-) create mode 100644 api/configs/middleware/vdb/oceanbase_config.py create mode 100644 api/core/rag/datasource/vdb/oceanbase/__init__.py create mode 100644 api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py create mode 100644 api/tests/integration_tests/vdb/oceanbase/__init__.py create mode 100644 api/tests/integration_tests/vdb/oceanbase/test_oceanbase.py diff --git a/.gitignore b/.gitignore index 29a80534f7..60b5781733 100644 --- a/.gitignore +++ b/.gitignore @@ -174,6 +174,7 @@ docker/volumes/unstructured/* docker/volumes/pgvector/data/* docker/volumes/pgvecto_rs/data/* docker/volumes/couchbase/* +docker/volumes/oceanbase/* docker/nginx/conf.d/default.conf docker/nginx/ssl/* diff --git a/api/.env.example b/api/.env.example index 2ce425338e..984985803e 100644 --- a/api/.env.example +++ b/api/.env.example @@ -249,6 +249,14 @@ VIKINGDB_SCHEMA=http VIKINGDB_CONNECTION_TIMEOUT=30 VIKINGDB_SOCKET_TIMEOUT=30 +# OceanBase Vector configuration +OCEANBASE_VECTOR_HOST=127.0.0.1 +OCEANBASE_VECTOR_PORT=2881 +OCEANBASE_VECTOR_USER=root@test +OCEANBASE_VECTOR_PASSWORD= +OCEANBASE_VECTOR_DATABASE=test +OCEANBASE_MEMORY_LIMIT=6G + # Upload configuration UPLOAD_FILE_SIZE_LIMIT=15 UPLOAD_FILE_BATCH_LIMIT=5 diff --git a/api/commands.py b/api/commands.py index da09f1b610..10122ceb3d 100644 --- a/api/commands.py +++ b/api/commands.py @@ -279,6 +279,7 @@ def migrate_knowledge_vector_database(): VectorType.VIKINGDB, VectorType.UPSTASH, VectorType.COUCHBASE, + VectorType.OCEANBASE, } page = 1 while True: diff --git a/api/configs/middleware/__init__.py b/api/configs/middleware/__init__.py index e8f6ba91b6..38bb804613 100644 --- a/api/configs/middleware/__init__.py +++ b/api/configs/middleware/__init__.py @@ -21,6 +21,7 @@ from configs.middleware.vdb.couchbase_config import CouchbaseConfig from configs.middleware.vdb.elasticsearch_config import ElasticsearchConfig from configs.middleware.vdb.milvus_config import MilvusConfig from configs.middleware.vdb.myscale_config import MyScaleConfig +from configs.middleware.vdb.oceanbase_config import OceanBaseVectorConfig from configs.middleware.vdb.opensearch_config import OpenSearchConfig from configs.middleware.vdb.oracle_config import OracleConfig from configs.middleware.vdb.pgvector_config import PGVectorConfig @@ -257,5 +258,6 @@ class MiddlewareConfig( VikingDBConfig, UpstashConfig, TidbOnQdrantConfig, + OceanBaseVectorConfig, ): pass diff --git a/api/configs/middleware/vdb/oceanbase_config.py b/api/configs/middleware/vdb/oceanbase_config.py new file mode 100644 index 0000000000..87427af960 --- /dev/null +++ b/api/configs/middleware/vdb/oceanbase_config.py @@ -0,0 +1,35 @@ +from typing import Optional + +from pydantic import Field, PositiveInt +from pydantic_settings import BaseSettings + + +class OceanBaseVectorConfig(BaseSettings): + """ + Configuration settings for OceanBase Vector database + """ + + OCEANBASE_VECTOR_HOST: Optional[str] = Field( + description="Hostname or IP address of the OceanBase Vector server (e.g. 'localhost')", + default=None, + ) + + OCEANBASE_VECTOR_PORT: Optional[PositiveInt] = Field( + description="Port number on which the OceanBase Vector server is listening (default is 2881)", + default=2881, + ) + + OCEANBASE_VECTOR_USER: Optional[str] = Field( + description="Username for authenticating with the OceanBase Vector database", + default=None, + ) + + OCEANBASE_VECTOR_PASSWORD: Optional[str] = Field( + description="Password for authenticating with the OceanBase Vector database", + default=None, + ) + + OCEANBASE_VECTOR_DATABASE: Optional[str] = Field( + description="Name of the OceanBase Vector database to connect to", + default=None, + ) diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index 854821746a..4f4d186edd 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -628,6 +628,7 @@ class DatasetRetrievalSettingApi(Resource): | VectorType.BAIDU | VectorType.VIKINGDB | VectorType.UPSTASH + | VectorType.OCEANBASE ): return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]} case ( @@ -669,6 +670,7 @@ class DatasetRetrievalSettingMockApi(Resource): | VectorType.BAIDU | VectorType.VIKINGDB | VectorType.UPSTASH + | VectorType.OCEANBASE ): return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]} case ( diff --git a/api/core/rag/datasource/vdb/oceanbase/__init__.py b/api/core/rag/datasource/vdb/oceanbase/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py new file mode 100644 index 0000000000..8dd26a073b --- /dev/null +++ b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py @@ -0,0 +1,209 @@ +import json +import logging +import math +from typing import Any + +from pydantic import BaseModel, model_validator +from pyobvector import VECTOR, ObVecClient +from sqlalchemy import JSON, Column, String, func +from sqlalchemy.dialects.mysql import LONGTEXT + +from configs import dify_config +from core.rag.datasource.vdb.vector_base import BaseVector +from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory +from core.rag.datasource.vdb.vector_type import VectorType +from core.rag.embedding.embedding_base import Embeddings +from core.rag.models.document import Document +from extensions.ext_redis import redis_client +from models.dataset import Dataset + +logger = logging.getLogger(__name__) + +DEFAULT_OCEANBASE_HNSW_BUILD_PARAM = {"M": 16, "efConstruction": 256} +DEFAULT_OCEANBASE_HNSW_SEARCH_PARAM = {"efSearch": 64} +OCEANBASE_SUPPORTED_VECTOR_INDEX_TYPE = "HNSW" +DEFAULT_OCEANBASE_VECTOR_METRIC_TYPE = "l2" + + +class OceanBaseVectorConfig(BaseModel): + host: str + port: int + user: str + password: str + database: str + + @model_validator(mode="before") + @classmethod + def validate_config(cls, values: dict) -> dict: + if not values["host"]: + raise ValueError("config OCEANBASE_VECTOR_HOST is required") + if not values["port"]: + raise ValueError("config OCEANBASE_VECTOR_PORT is required") + if not values["user"]: + raise ValueError("config OCEANBASE_VECTOR_USER is required") + if not values["database"]: + raise ValueError("config OCEANBASE_VECTOR_DATABASE is required") + return values + + +class OceanBaseVector(BaseVector): + def __init__(self, collection_name: str, config: OceanBaseVectorConfig): + super().__init__(collection_name) + self._config = config + self._hnsw_ef_search = -1 + self._client = ObVecClient( + uri=f"{self._config.host}:{self._config.port}", + user=self._config.user, + password=self._config.password, + db_name=self._config.database, + ) + + def get_type(self) -> str: + return VectorType.OCEANBASE + + def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): + self._vec_dim = len(embeddings[0]) + self._create_collection() + self.add_texts(texts, embeddings) + + def _create_collection(self) -> None: + lock_name = "vector_indexing_lock_" + self._collection_name + with redis_client.lock(lock_name, timeout=20): + collection_exist_cache_key = "vector_indexing_" + self._collection_name + if redis_client.get(collection_exist_cache_key): + return + + if self._client.check_table_exists(self._collection_name): + return + + self.delete() + + cols = [ + Column("id", String(36), primary_key=True, autoincrement=False), + Column("vector", VECTOR(self._vec_dim)), + Column("text", LONGTEXT), + Column("metadata", JSON), + ] + vidx_params = self._client.prepare_index_params() + vidx_params.add_index( + field_name="vector", + index_type=OCEANBASE_SUPPORTED_VECTOR_INDEX_TYPE, + index_name="vector_index", + metric_type=DEFAULT_OCEANBASE_VECTOR_METRIC_TYPE, + params=DEFAULT_OCEANBASE_HNSW_BUILD_PARAM, + ) + + self._client.create_table_with_index_params( + table_name=self._collection_name, + columns=cols, + vidxs=vidx_params, + ) + vals = [] + params = self._client.perform_raw_text_sql("SHOW PARAMETERS LIKE '%ob_vector_memory_limit_percentage%'") + for row in params: + val = int(row[6]) + vals.append(val) + if len(vals) == 0: + print("ob_vector_memory_limit_percentage not found in parameters.") + exit(1) + if any(val == 0 for val in vals): + try: + self._client.perform_raw_text_sql("ALTER SYSTEM SET ob_vector_memory_limit_percentage = 30") + except Exception as e: + raise Exception( + "Failed to set ob_vector_memory_limit_percentage. " + + "Maybe the database user has insufficient privilege.", + e, + ) + redis_client.set(collection_exist_cache_key, 1, ex=3600) + + def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): + ids = self._get_uuids(documents) + for id, doc, emb in zip(ids, documents, embeddings): + self._client.insert( + table_name=self._collection_name, + data={ + "id": id, + "vector": emb, + "text": doc.page_content, + "metadata": doc.metadata, + }, + ) + + def text_exists(self, id: str) -> bool: + cur = self._client.get(table_name=self._collection_name, id=id) + return cur.rowcount != 0 + + def delete_by_ids(self, ids: list[str]) -> None: + self._client.delete(table_name=self._collection_name, ids=ids) + + def get_ids_by_metadata_field(self, key: str, value: str) -> list[str]: + cur = self._client.get( + table_name=self._collection_name, + where_clause=f"metadata->>'$.{key}' = '{value}'", + output_column_name=["id"], + ) + return [row[0] for row in cur] + + def delete_by_metadata_field(self, key: str, value: str) -> None: + ids = self.get_ids_by_metadata_field(key, value) + self.delete_by_ids(ids) + + def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: + return [] + + def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: + ef_search = kwargs.get("ef_search", self._hnsw_ef_search) + if ef_search != self._hnsw_ef_search: + self._client.set_ob_hnsw_ef_search(ef_search) + self._hnsw_ef_search = ef_search + topk = kwargs.get("top_k", 10) + cur = self._client.ann_search( + table_name=self._collection_name, + vec_column_name="vector", + vec_data=query_vector, + topk=topk, + distance_func=func.l2_distance, + output_column_names=["text", "metadata"], + with_dist=True, + ) + docs = [] + for text, metadata, distance in cur: + metadata = json.loads(metadata) + metadata["score"] = 1 - distance / math.sqrt(2) + docs.append( + Document( + page_content=text, + metadata=metadata, + ) + ) + return docs + + def delete(self) -> None: + self._client.drop_table_if_exist(self._collection_name) + + +class OceanBaseVectorFactory(AbstractVectorFactory): + def init_vector( + self, + dataset: Dataset, + attributes: list, + embeddings: Embeddings, + ) -> BaseVector: + if dataset.index_struct_dict: + class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] + collection_name = class_prefix.lower() + else: + dataset_id = dataset.id + collection_name = Dataset.gen_collection_name_by_id(dataset_id).lower() + dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.OCEANBASE, collection_name)) + return OceanBaseVector( + collection_name, + OceanBaseVectorConfig( + host=dify_config.OCEANBASE_VECTOR_HOST, + port=dify_config.OCEANBASE_VECTOR_PORT, + user=dify_config.OCEANBASE_VECTOR_USER, + password=(dify_config.OCEANBASE_VECTOR_PASSWORD or ""), + database=dify_config.OCEANBASE_VECTOR_DATABASE, + ), + ) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 87d19bf60b..c8cb007ae8 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -134,6 +134,10 @@ class Vector: from core.rag.datasource.vdb.tidb_on_qdrant.tidb_on_qdrant_vector import TidbOnQdrantVectorFactory return TidbOnQdrantVectorFactory + case VectorType.OCEANBASE: + from core.rag.datasource.vdb.oceanbase.oceanbase_vector import OceanBaseVectorFactory + + return OceanBaseVectorFactory case _: raise ValueError(f"Vector store {vector_type} is not supported.") diff --git a/api/core/rag/datasource/vdb/vector_type.py b/api/core/rag/datasource/vdb/vector_type.py index 7384c12ff7..e3b37ece88 100644 --- a/api/core/rag/datasource/vdb/vector_type.py +++ b/api/core/rag/datasource/vdb/vector_type.py @@ -21,3 +21,4 @@ class VectorType(str, Enum): VIKINGDB = "vikingdb" UPSTASH = "upstash" TIDB_ON_QDRANT = "tidb_on_qdrant" + OCEANBASE = "oceanbase" diff --git a/api/poetry.lock b/api/poetry.lock index e1e5a6410b..5b581b9965 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -932,6 +932,10 @@ files = [ {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"}, {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"}, {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"}, {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"}, @@ -944,8 +948,14 @@ files = [ {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, @@ -956,8 +966,24 @@ files = [ {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, + {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, + {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, @@ -967,6 +993,10 @@ files = [ {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, {file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, @@ -978,6 +1008,10 @@ files = [ {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, @@ -990,6 +1024,10 @@ files = [ {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, @@ -1002,6 +1040,10 @@ files = [ {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, @@ -7269,6 +7311,22 @@ files = [ ed25519 = ["PyNaCl (>=1.4.0)"] rsa = ["cryptography"] +[[package]] +name = "pyobvector" +version = "0.1.6" +description = "A python SDK for OceanBase Vector Store, based on SQLAlchemy, compatible with Milvus API." +optional = false +python-versions = "<4.0,>=3.9" +files = [ + {file = "pyobvector-0.1.6-py3-none-any.whl", hash = "sha256:0d700e865a85b4716b9a03384189e49288cd9d5f3cef88aed4740bc82d5fd136"}, + {file = "pyobvector-0.1.6.tar.gz", hash = "sha256:05551addcac8c596992d5e38b480c83ca3481c6cfc6f56a1a1bddfb2e6ae037e"}, +] + +[package.dependencies] +numpy = ">=1.26.0,<2.0.0" +pymysql = ">=1.1.1,<2.0.0" +sqlalchemy = ">=2.0.32,<3.0.0" + [[package]] name = "pyopenssl" version = "24.2.1" @@ -8677,6 +8735,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -10919,4 +10982,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "52552faf5f4823056eb48afe05349ab2f0e9a5bc42105211ccbbb54b59e27b59" +content-hash = "ef927b98c33d704d680e08db0e5c7d9a4e05454c66fcd6a5f656a65eb08e886b" diff --git a/api/pyproject.toml b/api/pyproject.toml index a3313f0ff5..ee7cf4d618 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -247,6 +247,7 @@ pgvecto-rs = { version = "~0.2.1", extras = ['sqlalchemy'] } pgvector = "0.2.5" pymilvus = "~2.4.4" pymochow = "1.3.1" +pyobvector = "~0.1.6" qdrant-client = "1.7.3" tcvectordb = "1.3.2" tidb-vector = "0.0.9" diff --git a/api/tests/integration_tests/vdb/oceanbase/__init__.py b/api/tests/integration_tests/vdb/oceanbase/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/tests/integration_tests/vdb/oceanbase/test_oceanbase.py b/api/tests/integration_tests/vdb/oceanbase/test_oceanbase.py new file mode 100644 index 0000000000..ebcb134168 --- /dev/null +++ b/api/tests/integration_tests/vdb/oceanbase/test_oceanbase.py @@ -0,0 +1,71 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from core.rag.datasource.vdb.oceanbase.oceanbase_vector import ( + OceanBaseVector, + OceanBaseVectorConfig, +) +from tests.integration_tests.vdb.__mock.tcvectordb import setup_tcvectordb_mock +from tests.integration_tests.vdb.test_vector_store import ( + AbstractVectorTest, + get_example_text, + setup_mock_redis, +) + + +@pytest.fixture +def oceanbase_vector(): + return OceanBaseVector( + "dify_test_collection", + config=OceanBaseVectorConfig( + host="127.0.0.1", + port="2881", + user="root@test", + database="test", + password="test", + ), + ) + + +class OceanBaseVectorTest(AbstractVectorTest): + def __init__(self, vector: OceanBaseVector): + super().__init__() + self.vector = vector + + def search_by_vector(self): + hits_by_vector = self.vector.search_by_vector(query_vector=self.example_embedding) + assert len(hits_by_vector) == 0 + + def search_by_full_text(self): + hits_by_full_text = self.vector.search_by_full_text(query=get_example_text()) + assert len(hits_by_full_text) == 0 + + def text_exists(self): + exist = self.vector.text_exists(self.example_doc_id) + assert exist == True + + def get_ids_by_metadata_field(self): + ids = self.vector.get_ids_by_metadata_field(key="document_id", value=self.example_doc_id) + assert len(ids) == 0 + + +@pytest.fixture +def setup_mock_oceanbase_client(): + with patch("core.rag.datasource.vdb.oceanbase.oceanbase_vector.ObVecClient", new_callable=MagicMock) as mock_client: + yield mock_client + + +@pytest.fixture +def setup_mock_oceanbase_vector(oceanbase_vector): + with patch.object(oceanbase_vector, "_client"): + yield oceanbase_vector + + +def test_oceanbase_vector( + setup_mock_redis, + setup_mock_oceanbase_client, + setup_mock_oceanbase_vector, + oceanbase_vector, +): + OceanBaseVectorTest(oceanbase_vector).run_all_tests() diff --git a/dev/pytest/pytest_vdb.sh b/dev/pytest/pytest_vdb.sh index 418a129693..02a9f49279 100755 --- a/dev/pytest/pytest_vdb.sh +++ b/dev/pytest/pytest_vdb.sh @@ -13,3 +13,4 @@ pytest api/tests/integration_tests/vdb/chroma \ api/tests/integration_tests/vdb/tcvectordb \ api/tests/integration_tests/vdb/upstash \ api/tests/integration_tests/vdb/couchbase \ + api/tests/integration_tests/vdb/oceanbase \ diff --git a/docker/.env.example b/docker/.env.example index c506a9d92e..db99701c52 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -517,6 +517,14 @@ VIKINGDB_SCHEMA=http VIKINGDB_CONNECTION_TIMEOUT=30 VIKINGDB_SOCKET_TIMEOUT=30 +# OceanBase Vector configuration, only available when VECTOR_STORE is `oceanbase` +OCEANBASE_VECTOR_HOST=oceanbase-vector +OCEANBASE_VECTOR_PORT=2881 +OCEANBASE_VECTOR_USER=root@test +OCEANBASE_VECTOR_PASSWORD= +OCEANBASE_VECTOR_DATABASE=test +OCEANBASE_MEMORY_LIMIT=6G + # ------------------------------ # Knowledge Configuration # ------------------------------ diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 2a756ead9c..28a70b0f56 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -243,6 +243,12 @@ x-shared-env: &shared-api-worker-env POSITION_PROVIDER_INCLUDES: ${POSITION_PROVIDER_INCLUDES:-} POSITION_PROVIDER_EXCLUDES: ${POSITION_PROVIDER_EXCLUDES:-} MAX_VARIABLE_SIZE: ${MAX_VARIABLE_SIZE:-204800} + OCEANBASE_VECTOR_HOST: ${OCEANBASE_VECTOR_HOST:-http://oceanbase-vector} + OCEANBASE_VECTOR_PORT: ${OCEANBASE_VECTOR_PORT:-2881} + OCEANBASE_VECTOR_USER: ${OCEANBASE_VECTOR_USER:-root@test} + OCEANBASE_VECTOR_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-""} + OCEANBASE_VECTOR_DATABASE: ${OCEANBASE_VECTOR_DATABASE:-test} + OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G} services: # API service @@ -570,6 +576,18 @@ services: CHROMA_SERVER_AUTHN_PROVIDER: ${CHROMA_SERVER_AUTHN_PROVIDER:-chromadb.auth.token_authn.TokenAuthenticationServerProvider} IS_PERSISTENT: ${CHROMA_IS_PERSISTENT:-TRUE} + # OceanBase vector database + oceanbase-vector: + image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215 + profiles: + - oceanbase-vector + restart: always + volumes: + - ./volumes/oceanbase/data:/root/ob + - ./volumes/oceanbase/conf:/root/.obd/cluster + environment: + OB_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G} + # Oracle vector database oracle: image: container-registry.oracle.com/database/free:latest