From c8ef9223e5a671d29ac8cfd0f79aea91ddf4354a Mon Sep 17 00:00:00 2001 From: roadgoat19 <123213767+roadgoat19@users.noreply.github.com> Date: Tue, 29 Oct 2024 03:00:23 -0400 Subject: [PATCH] feat: couchbase integration (#6165) Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: Elliot Scribner Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: Bowen Liang --- .github/workflows/api-tests.yml | 3 +- .github/workflows/expose_service_ports.sh | 4 +- .gitignore | 3 +- api/.env.example | 9 +- api/commands.py | 1 + api/configs/middleware/__init__.py | 2 + .../middleware/vdb/couchbase_config.py | 34 ++ api/controllers/console/datasets/datasets.py | 2 + .../rag/datasource/vdb/couchbase/__init__.py | 0 .../vdb/couchbase/couchbase_vector.py | 378 ++++++++++++++++++ api/core/rag/datasource/vdb/vector_factory.py | 4 + api/core/rag/datasource/vdb/vector_type.py | 1 + api/poetry.lock | 55 ++- api/pyproject.toml | 1 + .../vdb/couchbase/__init__.py | 0 .../vdb/couchbase/test_couchbase.py | 50 +++ dev/pytest/pytest_vdb.sh | 3 +- docker/.env.example | 10 +- docker/couchbase-server/Dockerfile | 4 + docker/couchbase-server/init-cbserver.sh | 44 ++ docker/docker-compose.yaml | 38 ++ 21 files changed, 639 insertions(+), 7 deletions(-) create mode 100644 api/configs/middleware/vdb/couchbase_config.py create mode 100644 api/core/rag/datasource/vdb/couchbase/__init__.py create mode 100644 api/core/rag/datasource/vdb/couchbase/couchbase_vector.py create mode 100644 api/tests/integration_tests/vdb/couchbase/__init__.py create mode 100644 api/tests/integration_tests/vdb/couchbase/test_couchbase.py create mode 100644 docker/couchbase-server/Dockerfile create mode 100755 docker/couchbase-server/init-cbserver.sh diff --git a/.github/workflows/api-tests.yml b/.github/workflows/api-tests.yml index e9c2b7b086..c87d5a4dd4 100644 --- a/.github/workflows/api-tests.yml +++ b/.github/workflows/api-tests.yml @@ -78,7 +78,7 @@ jobs: - name: Run Workflow run: poetry run -C api bash dev/pytest/pytest_workflow.sh - - name: Set up Vector Stores (Weaviate, Qdrant, PGVector, Milvus, PgVecto-RS, Chroma, MyScale, ElasticSearch) + - name: Set up Vector Stores (Weaviate, Qdrant, PGVector, Milvus, PgVecto-RS, Chroma, MyScale, ElasticSearch, Couchbase) uses: hoverkraft-tech/compose-action@v2.0.0 with: compose-file: | @@ -86,6 +86,7 @@ jobs: services: | weaviate qdrant + couchbase-server etcd minio milvus-standalone diff --git a/.github/workflows/expose_service_ports.sh b/.github/workflows/expose_service_ports.sh index ae3e0ee69d..bc65c19a91 100755 --- a/.github/workflows/expose_service_ports.sh +++ b/.github/workflows/expose_service_ports.sh @@ -7,5 +7,7 @@ yq eval '.services["milvus-standalone"].ports += ["19530:19530"]' -i docker/dock yq eval '.services.pgvector.ports += ["5433:5432"]' -i docker/docker-compose.yaml yq eval '.services["pgvecto-rs"].ports += ["5431:5432"]' -i docker/docker-compose.yaml yq eval '.services["elasticsearch"].ports += ["9200:9200"]' -i docker/docker-compose.yaml +yq eval '.services.couchbase-server.ports += ["8091-8096:8091-8096"]' -i docker/docker-compose.yaml +yq eval '.services.couchbase-server.ports += ["11210:11210"]' -i docker/docker-compose.yaml -echo "Ports exposed for sandbox, weaviate, qdrant, chroma, milvus, pgvector, pgvecto-rs, elasticsearch" \ No newline at end of file +echo "Ports exposed for sandbox, weaviate, qdrant, chroma, milvus, pgvector, pgvecto-rs, elasticsearch, couchbase" diff --git a/.gitignore b/.gitignore index 27cf8a4ba3..29a80534f7 
100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,6 +173,7 @@ docker/volumes/myscale/log/*
 docker/volumes/unstructured/*
 docker/volumes/pgvector/data/*
 docker/volumes/pgvecto_rs/data/*
+docker/volumes/couchbase/*
 
 docker/nginx/conf.d/default.conf
 docker/nginx/ssl/*
@@ -189,4 +190,4 @@ pyrightconfig.json
 api/.vscode
 
 .idea/
-.vscode
\ No newline at end of file
+.vscode
diff --git a/api/.env.example b/api/.env.example
index 184a811c51..2ce425338e 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -120,7 +120,7 @@ SUPABASE_URL=your-server-url
 WEB_API_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,*
 CONSOLE_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,*
 
-# Vector database configuration, support: weaviate, qdrant, milvus, myscale, relyt, pgvecto_rs, pgvector, pgvector, chroma, opensearch, tidb_vector, vikingdb, upstash
+# Vector database configuration, support: weaviate, qdrant, milvus, myscale, relyt, pgvecto_rs, pgvector, chroma, opensearch, tidb_vector, couchbase, vikingdb, upstash
 VECTOR_STORE=weaviate
 
 # Weaviate configuration
@@ -136,6 +136,13 @@ QDRANT_CLIENT_TIMEOUT=20
 QDRANT_GRPC_ENABLED=false
 QDRANT_GRPC_PORT=6334
 
+# Couchbase configuration
+COUCHBASE_CONNECTION_STRING=127.0.0.1
+COUCHBASE_USER=Administrator
+COUCHBASE_PASSWORD=password
+COUCHBASE_BUCKET_NAME=Embeddings
+COUCHBASE_SCOPE_NAME=_default
+
 # Milvus configuration
 MILVUS_URI=http://127.0.0.1:19530
 MILVUS_TOKEN=
diff --git a/api/commands.py b/api/commands.py
index 720a4447da..da09f1b610 100644
--- a/api/commands.py
+++ b/api/commands.py
@@ -278,6 +278,7 @@ def migrate_knowledge_vector_database():
         VectorType.BAIDU,
         VectorType.VIKINGDB,
         VectorType.UPSTASH,
+        VectorType.COUCHBASE,
     }
     page = 1
     while True:
diff --git a/api/configs/middleware/__init__.py b/api/configs/middleware/__init__.py
index 3d68e29d0e..e8f6ba91b6 100644
--- a/api/configs/middleware/__init__.py
+++ b/api/configs/middleware/__init__.py
@@ -17,6 +17,7 @@ from configs.middleware.storage.tencent_cos_storage_config import TencentCloudCO
 from configs.middleware.storage.volcengine_tos_storage_config import VolcengineTOSStorageConfig
 from configs.middleware.vdb.analyticdb_config import AnalyticdbConfig
 from configs.middleware.vdb.chroma_config import ChromaConfig
+from configs.middleware.vdb.couchbase_config import CouchbaseConfig
 from configs.middleware.vdb.elasticsearch_config import ElasticsearchConfig
 from configs.middleware.vdb.milvus_config import MilvusConfig
 from configs.middleware.vdb.myscale_config import MyScaleConfig
@@ -251,6 +252,7 @@ class MiddlewareConfig(
     TiDBVectorConfig,
     WeaviateConfig,
     ElasticsearchConfig,
+    CouchbaseConfig,
     InternalTestConfig,
     VikingDBConfig,
     UpstashConfig,
diff --git a/api/configs/middleware/vdb/couchbase_config.py b/api/configs/middleware/vdb/couchbase_config.py
new file mode 100644
index 0000000000..391089ec6e
--- /dev/null
+++ b/api/configs/middleware/vdb/couchbase_config.py
@@ -0,0 +1,34 @@
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class CouchbaseConfig(BaseModel):
+    """
+    Couchbase configs
+    """
+
+    COUCHBASE_CONNECTION_STRING: Optional[str] = Field(
+        description="Couchbase connection string",
+        default=None,
+    )
+
+    COUCHBASE_USER: Optional[str] = Field(
+        description="Couchbase user name",
+        default=None,
+    )
+
+    COUCHBASE_PASSWORD: Optional[str] = Field(
+        description="Couchbase password",
+        default=None,
+    )
+
+    COUCHBASE_BUCKET_NAME: Optional[str] = Field(
+        description="Couchbase bucket name",
+        default=None,
+    )
+
+    COUCHBASE_SCOPE_NAME: Optional[str] = Field(
+        description="Couchbase scope name",
+        default=None,
+    )
diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py
index c8022efb8b..854821746a 100644
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@@ -640,6 +640,7 @@ class DatasetRetrievalSettingApi(Resource):
                 | VectorType.ELASTICSEARCH
                 | VectorType.PGVECTOR
                 | VectorType.TIDB_ON_QDRANT
+                | VectorType.COUCHBASE
             ):
                 return {
                     "retrieval_method": [
@@ -678,6 +679,7 @@ class DatasetRetrievalSettingMockApi(Resource):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.COUCHBASE
                 | VectorType.PGVECTOR
             ):
                 return {
diff --git a/api/core/rag/datasource/vdb/couchbase/__init__.py b/api/core/rag/datasource/vdb/couchbase/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/core/rag/datasource/vdb/couchbase/couchbase_vector.py b/api/core/rag/datasource/vdb/couchbase/couchbase_vector.py
new file mode 100644
index 0000000000..3f88d2ca2b
--- /dev/null
+++ b/api/core/rag/datasource/vdb/couchbase/couchbase_vector.py
@@ -0,0 +1,378 @@
+import json
+import logging
+import time
+import uuid
+from datetime import timedelta
+from typing import Any
+
+from couchbase import search
+from couchbase.auth import PasswordAuthenticator
+from couchbase.cluster import Cluster
+from couchbase.management.search import SearchIndex
+
+# needed for options -- cluster, timeout, SQL++ (N1QL) query, etc.
+from couchbase.options import ClusterOptions, SearchOptions
+from couchbase.vector_search import VectorQuery, VectorSearch
+from flask import current_app
+from pydantic import BaseModel, model_validator
+
+from core.rag.datasource.vdb.vector_base import BaseVector
+from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
+from core.rag.datasource.vdb.vector_type import VectorType
+from core.rag.embedding.embedding_base import Embeddings
+from core.rag.models.document import Document
+from extensions.ext_redis import redis_client
+from models.dataset import Dataset
+
+logger = logging.getLogger(__name__)
+
+
+class CouchbaseConfig(BaseModel):
+    connection_string: str
+    user: str
+    password: str
+    bucket_name: str
+    scope_name: str
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_config(cls, values: dict) -> dict:
+        if not values.get("connection_string"):
+            raise ValueError("config COUCHBASE_CONNECTION_STRING is required")
+        if not values.get("user"):
+            raise ValueError("config COUCHBASE_USER is required")
+        if not values.get("password"):
+            raise ValueError("config COUCHBASE_PASSWORD is required")
+        if not values.get("bucket_name"):
+            raise ValueError("config COUCHBASE_BUCKET_NAME is required")
+        if not values.get("scope_name"):
+            raise ValueError("config COUCHBASE_SCOPE_NAME is required")
+        return values
+
+
+class CouchbaseVector(BaseVector):
+    def __init__(self, collection_name: str, config: CouchbaseConfig):
+        super().__init__(collection_name)
+        self._client_config = config
+
+        # Connect to the Couchbase cluster.
+
+        auth = PasswordAuthenticator(config.user, config.password)
+        options = ClusterOptions(auth)
+        self._cluster = Cluster(config.connection_string, options)
+        self._bucket = self._cluster.bucket(config.bucket_name)
+        self._scope = self._bucket.scope(config.scope_name)
+        self._bucket_name = config.bucket_name
+        self._scope_name = config.scope_name
+
+        # Wait until the cluster is ready for use.
+ self._cluster.wait_until_ready(timedelta(seconds=5)) + + def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): + index_id = str(uuid.uuid4()).replace("-", "") + self._create_collection(uuid=index_id, vector_length=len(embeddings[0])) + self.add_texts(texts, embeddings) + + def _create_collection(self, vector_length: int, uuid: str): + lock_name = "vector_indexing_lock_{}".format(self._collection_name) + with redis_client.lock(lock_name, timeout=20): + collection_exist_cache_key = "vector_indexing_{}".format(self._collection_name) + if redis_client.get(collection_exist_cache_key): + return + if self._collection_exists(self._collection_name): + return + manager = self._bucket.collections() + manager.create_collection(self._client_config.scope_name, self._collection_name) + + index_manager = self._scope.search_indexes() + + index_definition = json.loads(""" +{ + "type": "fulltext-index", + "name": "Embeddings._default.Vector_Search", + "uuid": "26d4db528e78b716", + "sourceType": "gocbcore", + "sourceName": "Embeddings", + "sourceUUID": "2242e4a25b4decd6650c9c7b3afa1dbf", + "planParams": { + "maxPartitionsPerPIndex": 1024, + "indexPartitions": 1 + }, + "params": { + "doc_config": { + "docid_prefix_delim": "", + "docid_regexp": "", + "mode": "scope.collection.type_field", + "type_field": "type" + }, + "mapping": { + "analysis": { }, + "default_analyzer": "standard", + "default_datetime_parser": "dateTimeOptional", + "default_field": "_all", + "default_mapping": { + "dynamic": true, + "enabled": true + }, + "default_type": "_default", + "docvalues_dynamic": false, + "index_dynamic": true, + "store_dynamic": true, + "type_field": "_type", + "types": { + "collection_name": { + "dynamic": true, + "enabled": true, + "properties": { + "embedding": { + "dynamic": false, + "enabled": true, + "fields": [ + { + "dims": 1536, + "index": true, + "name": "embedding", + "similarity": "dot_product", + "type": "vector", + "vector_index_optimized_for": "recall" + } + ] + }, + "metadata": { + "dynamic": true, + "enabled": true + }, + "text": { + "dynamic": false, + "enabled": true, + "fields": [ + { + "index": true, + "name": "text", + "store": true, + "type": "text" + } + ] + } + } + } + } + }, + "store": { + "indexType": "scorch", + "segmentVersion": 16 + } + }, + "sourceParams": { } + } +""") + index_definition["name"] = self._collection_name + "_search" + index_definition["uuid"] = uuid + index_definition["params"]["mapping"]["types"]["collection_name"]["properties"]["embedding"]["fields"][0][ + "dims" + ] = vector_length + index_definition["params"]["mapping"]["types"][self._scope_name + "." 
+ self._collection_name] = (
+            index_definition["params"]["mapping"]["types"].pop("collection_name")
+        )
+        time.sleep(2)  # brief pause to let the newly created collection become available
+        index_manager.upsert_index(
+            SearchIndex(
+                index_definition["name"],
+                params=index_definition["params"],
+                source_name=self._bucket_name,
+            ),
+        )
+        time.sleep(1)  # allow the index definition to propagate
+
+        redis_client.set(collection_exist_cache_key, 1, ex=3600)
+
+    def _collection_exists(self, name: str):
+        scope_collection_map: dict[str, Any] = {}
+
+        # Get a list of all scopes in the bucket
+        for scope in self._bucket.collections().get_all_scopes():
+            scope_collection_map[scope.name] = []
+
+            # Get a list of all the collections in the scope
+            for collection in scope.collections:
+                scope_collection_map[scope.name].append(collection.name)
+
+        # Check if the collection exists in the scope
+        return self._collection_name in scope_collection_map[self._scope_name]
+
+    def get_type(self) -> str:
+        return VectorType.COUCHBASE
+
+    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
+        uuids = self._get_uuids(documents)
+        texts = [d.page_content for d in documents]
+        metadatas = [d.metadata for d in documents]
+
+        doc_ids = []
+
+        documents_to_insert = [
+            {"text": text, "embedding": vector, "metadata": metadata}
+            for text, vector, metadata in zip(texts, embeddings, metadatas)
+        ]
+        for doc, doc_id in zip(documents_to_insert, uuids):
+            self._scope.collection(self._collection_name).upsert(doc_id, doc)
+
+        doc_ids.extend(uuids)
+
+        return doc_ids
+
+    def text_exists(self, id: str) -> bool:
+        # Use a parameterized query for safety and correctness
+        query = f"""
+                SELECT COUNT(1) AS count FROM
+                `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
+                WHERE META().id = $doc_id
+                """
+        # Pass the id as a parameter to the query
+        result = self._cluster.query(query, named_parameters={"doc_id": id}).execute()
+        for row in result:
+            return row["count"] > 0
+        return False  # Return False if no rows are returned
+
+    def delete_by_ids(self, ids: list[str]) -> None:
+        query = f"""
+            DELETE FROM `{self._bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
+            WHERE META().id IN $doc_ids;
+            """
+        try:
+            self._cluster.query(query, named_parameters={"doc_ids": ids}).execute()
+        except Exception as e:
+            logger.error(e)
+
+    def delete_by_document_id(self, document_id: str):
+        query = f"""
+                DELETE FROM
+                `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
+                WHERE META().id = $doc_id;
+                """
+        self._cluster.query(query, named_parameters={"doc_id": document_id}).execute()
+
+    # def get_ids_by_metadata_field(self, key: str, value: str):
+    #     query = f"""
+    #         SELECT id FROM
+    #         `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
+    #         WHERE `metadata.{key}` = $value;
+    #         """
+    #     result = self._cluster.query(query, named_parameters={'value':value})
+    #     return [row['id'] for row in result.rows()]
+
+    def delete_by_metadata_field(self, key: str, value: str) -> None:
+        query = f"""
+            DELETE FROM `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
+            WHERE metadata.{key} = $value;
+            """
+        self._cluster.query(query, named_parameters={"value": value}).execute()
+
+    def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
+        top_k = kwargs.get("top_k", 5)
+        score_threshold = kwargs.get("score_threshold") or 0.0
+
+        search_req = search.SearchRequest.create(
+            VectorSearch.from_vector_query(
+                VectorQuery(
+                    "embedding",
+                    query_vector,
+                    top_k,
+                )
+            )
+        )
+        try:
+            search_iter = self._scope.search(
+                self._collection_name + "_search",
+                search_req,
+                SearchOptions(limit=top_k, collections=[self._collection_name], fields=["*"]),
+            )
+
+            docs = []
+            # Parse the results
+            for row in search_iter.rows():
+                text = row.fields.pop("text")
+                metadata = self._format_metadata(row.fields)
+                score = row.score
+                metadata["score"] = score
+                doc = Document(page_content=text, metadata=metadata)
+                if score >= score_threshold:
+                    docs.append(doc)
+        except Exception as e:
+            raise ValueError(f"Search failed with error: {e}")
+
+        return docs
+
+    def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
+        top_k = kwargs.get("top_k", 2)
+        try:
+            cb_request = search.SearchRequest.create(search.QueryStringQuery("text:" + query))
+            search_iter = self._scope.search(
+                self._collection_name + "_search", cb_request, SearchOptions(limit=top_k, fields=["*"])
+            )
+
+            docs = []
+            for row in search_iter.rows():
+                text = row.fields.pop("text")
+                metadata = self._format_metadata(row.fields)
+                score = row.score
+                metadata["score"] = score
+                doc = Document(page_content=text, metadata=metadata)
+                docs.append(doc)
+
+        except Exception as e:
+            raise ValueError(f"Search failed with error: {e}")
+
+        return docs
+
+    def delete(self):
+        manager = self._bucket.collections()
+        scopes = manager.get_all_scopes()
+
+        for scope in scopes:
+            for collection in scope.collections:
+                if collection.name == self._collection_name:
+                    manager.drop_collection(self._scope_name, self._collection_name)  # drop from the configured scope
+
+    def _format_metadata(self, row_fields: dict[str, Any]) -> dict[str, Any]:
+        """Helper method to format the metadata from the Couchbase Search API.
+
+        Args:
+            row_fields (Dict[str, Any]): The fields to format.
+
+        Returns:
+            Dict[str, Any]: The formatted metadata.
+ """ + metadata = {} + for key, value in row_fields.items(): + # Couchbase Search returns the metadata key with a prefix + # `metadata.` We remove it to get the original metadata key + if key.startswith("metadata"): + new_key = key.split("metadata" + ".")[-1] + metadata[new_key] = value + else: + metadata[key] = value + + return metadata + + +class CouchbaseVectorFactory(AbstractVectorFactory): + def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> CouchbaseVector: + if dataset.index_struct_dict: + class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] + collection_name = class_prefix + else: + dataset_id = dataset.id + collection_name = Dataset.gen_collection_name_by_id(dataset_id) + dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.COUCHBASE, collection_name)) + + config = current_app.config + return CouchbaseVector( + collection_name=collection_name, + config=CouchbaseConfig( + connection_string=config.get("COUCHBASE_CONNECTION_STRING"), + user=config.get("COUCHBASE_USER"), + password=config.get("COUCHBASE_PASSWORD"), + bucket_name=config.get("COUCHBASE_BUCKET_NAME"), + scope_name=config.get("COUCHBASE_SCOPE_NAME"), + ), + ) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 59a5aadacd..87d19bf60b 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -114,6 +114,10 @@ class Vector: from core.rag.datasource.vdb.analyticdb.analyticdb_vector import AnalyticdbVectorFactory return AnalyticdbVectorFactory + case VectorType.COUCHBASE: + from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseVectorFactory + + return CouchbaseVectorFactory case VectorType.BAIDU: from core.rag.datasource.vdb.baidu.baidu_vector import BaiduVectorFactory diff --git a/api/core/rag/datasource/vdb/vector_type.py b/api/core/rag/datasource/vdb/vector_type.py index 3b6df94f78..7384c12ff7 100644 --- a/api/core/rag/datasource/vdb/vector_type.py +++ b/api/core/rag/datasource/vdb/vector_type.py @@ -16,6 +16,7 @@ class VectorType(str, Enum): TENCENT = "tencent" ORACLE = "oracle" ELASTICSEARCH = "elasticsearch" + COUCHBASE = "couchbase" BAIDU = "baidu" VIKINGDB = "vikingdb" UPSTASH = "upstash" diff --git a/api/poetry.lock b/api/poetry.lock index 618dbb4033..e1e5a6410b 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -1801,6 +1801,46 @@ requests = ">=2.8" six = "*" xmltodict = "*" +[[package]] +name = "couchbase" +version = "4.3.3" +description = "Python Client for Couchbase" +optional = false +python-versions = ">=3.7" +files = [ + {file = "couchbase-4.3.3-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:d8069e4f01332859d56cca597874645c914699162b3979d1b432f0dfc186b124"}, + {file = "couchbase-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1caa6cfef49c785b35b1702102f718227f351df87bba2694b9334520c41e9eb5"}, + {file = "couchbase-4.3.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f4a9a65c44935249fa078fb90a3c28ea71da9d2d5889fcd514b12d0538010ae0"}, + {file = "couchbase-4.3.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4f144b8c482c18283d8e419b844630d41f3249b07d43d40b5e3535444e57d0fb"}, + {file = "couchbase-4.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1c534fba6fdc7cf47eed9dee8a57d1e9eb867bf008574e321fa380a77cebf32f"}, + {file = "couchbase-4.3.3-cp310-cp310-win_amd64.whl", hash = 
"sha256:b841be06e0e4370b69ebef6bca3409c378186f7d6e964cd645ba18e97216c022"}, + {file = "couchbase-4.3.3-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:eee7a73b3acbdc78ae314fddf7f975b3c9e05df07df255f4dcc878939a2abae0"}, + {file = "couchbase-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:53417cafcf90ff4e2fd81ebba2a08b7ad56f17160d1c5019ad3b09c758aeb363"}, + {file = "couchbase-4.3.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0cefd13bea8b0f150f1b9d27fd7614f971f77419b31817781d26ba315ed658bb"}, + {file = "couchbase-4.3.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:78fa1054d7740e2fe38fce0a2aab4e9a2d30263d894e0615ee5df297f02f59a3"}, + {file = "couchbase-4.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb093899cfad5a7472258a9b6a57775dbf23a6e0180241507ba89ce3ab241e41"}, + {file = "couchbase-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f7cfbdc699af5715f49365ffbb05a6a7366a534c0d7161edf270ad3e735a6c5d"}, + {file = "couchbase-4.3.3-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:58352cae9b8affdaa2ac012e0a03c8c2632ee6297a878232888b4e0360d0d5df"}, + {file = "couchbase-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:728e7e3b5e1682706cb9d63993d289226d02a25089527b8ecb4e3889dabc38cf"}, + {file = "couchbase-4.3.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:73014bf098cf14187a39cc13453e0d859c1d54568df28f69cc308a9a5f24feb2"}, + {file = "couchbase-4.3.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a743375804068ae01b73c916bfca738764c8c12f381bb399ef04e784935856a1"}, + {file = "couchbase-4.3.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:394c122cfe02a76a99e7d5178e64129f6da49843225e78d8629abcab556c24af"}, + {file = "couchbase-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:bf85d7a5cda548d9801614651206068b4445fa37972e62b14d7521a958198693"}, + {file = "couchbase-4.3.3-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:92d23c9cedd571631070791f2afee0e3d7d8c9ce1bf2ea6e9a4f2fdbc37a0f1e"}, + {file = "couchbase-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:38c42eb29a73cce2998ae5df45bd61b16dce9765d3bff968ec5cf6a622faa291"}, + {file = "couchbase-4.3.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:afed137bf0edc642d7b201b6ab7b1e7117bb4c8eac6b2f253cc6e106f334a2a1"}, + {file = "couchbase-4.3.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:954d991377d47883aaf903934c5d0f19577680a2abf80d3ce5bb9b3c80991fc7"}, + {file = "couchbase-4.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5552b9fa684630698dc98d6f3b1082540634c1b7ad5bf53b843b5da57b0169c"}, + {file = "couchbase-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:f88f2b7e0c894f7237d9f3fb5c46abc44b8151a97b3ca8e75f57d23ebf59f9da"}, + {file = "couchbase-4.3.3-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:769e1e2367ea1d4de181fcd4b4e353e9abef97d15b581a6c5aea49ece3dc7d59"}, + {file = "couchbase-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:47f59a0b35ffce060583fd11f98f049f3b70701cf14aab9ac092594aca486aeb"}, + {file = "couchbase-4.3.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:440bb93d611827ba0ea2403c6f204fe931467a6cb5811f0e03bf1779204ef843"}, + {file = "couchbase-4.3.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cdb4dde62e1d41c0b8707121ab68fa78b7a1508541bd48fc850be396f91bc8d9"}, + {file = "couchbase-4.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:7f8cf45f317b39cc19db5c67b565662f08d6c90305b3aa14e04bc22707258213"}, + {file = "couchbase-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:c97d48ad486c8f201b4482d5594258f949369cb44792ed148d5159a3d12ae21b"}, + {file = "couchbase-4.3.3.tar.gz", hash = "sha256:27808500551564b39b46943cf3daab572694889c1eb638425d363edb48b20da7"}, +] + [[package]] name = "coverage" version = "7.2.7" @@ -6850,6 +6890,19 @@ files = [ {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, + {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, + {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, + {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, ] [package.dependencies] @@ -10866,4 +10919,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "1b268122d3d4771ba219f0e983322e0454b7b8644dba35da38d7d950d489e1ba" +content-hash = "52552faf5f4823056eb48afe05349ab2f0e9a5bc42105211ccbbb54b59e27b59" diff --git a/api/pyproject.toml b/api/pyproject.toml index a549601535..a3313f0ff5 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -239,6 +239,7 @@ alibabacloud_gpdb20160503 = "~3.8.0" alibabacloud_tea_openapi = "~0.3.9" chromadb = "0.5.1" clickhouse-connect = "~0.7.16" +couchbase = "~4.3.0" elasticsearch = "8.14.0" opensearch-py = "2.4.0" oracledb = "~2.2.1" diff --git a/api/tests/integration_tests/vdb/couchbase/__init__.py 
b/api/tests/integration_tests/vdb/couchbase/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/tests/integration_tests/vdb/couchbase/test_couchbase.py b/api/tests/integration_tests/vdb/couchbase/test_couchbase.py new file mode 100644 index 0000000000..d76c34ba0e --- /dev/null +++ b/api/tests/integration_tests/vdb/couchbase/test_couchbase.py @@ -0,0 +1,50 @@ +import subprocess +import time + +from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseConfig, CouchbaseVector +from tests.integration_tests.vdb.test_vector_store import ( + AbstractVectorTest, + get_example_text, + setup_mock_redis, +) + + +def wait_for_healthy_container(service_name="couchbase-server", timeout=300): + start_time = time.time() + while time.time() - start_time < timeout: + result = subprocess.run( + ["docker", "inspect", "--format", "{{.State.Health.Status}}", service_name], capture_output=True, text=True + ) + if result.stdout.strip() == "healthy": + print(f"{service_name} is healthy!") + return True + else: + print(f"Waiting for {service_name} to be healthy...") + time.sleep(10) + raise TimeoutError(f"{service_name} did not become healthy in time") + + +class CouchbaseTest(AbstractVectorTest): + def __init__(self): + super().__init__() + self.vector = CouchbaseVector( + collection_name=self.collection_name, + config=CouchbaseConfig( + connection_string="couchbase://127.0.0.1", + user="Administrator", + password="password", + bucket_name="Embeddings", + scope_name="_default", + ), + ) + + def search_by_vector(self): + # brief sleep to ensure document is indexed + time.sleep(5) + hits_by_vector = self.vector.search_by_vector(query_vector=self.example_embedding) + assert len(hits_by_vector) == 1 + + +def test_couchbase(setup_mock_redis): + wait_for_healthy_container("couchbase-server", timeout=60) + CouchbaseTest().run_all_tests() diff --git a/dev/pytest/pytest_vdb.sh b/dev/pytest/pytest_vdb.sh index 579da6a30e..418a129693 100755 --- a/dev/pytest/pytest_vdb.sh +++ b/dev/pytest/pytest_vdb.sh @@ -11,4 +11,5 @@ pytest api/tests/integration_tests/vdb/chroma \ api/tests/integration_tests/vdb/vikingdb \ api/tests/integration_tests/vdb/baidu \ api/tests/integration_tests/vdb/tcvectordb \ - api/tests/integration_tests/vdb/upstash \ No newline at end of file + api/tests/integration_tests/vdb/upstash \ + api/tests/integration_tests/vdb/couchbase \ diff --git a/docker/.env.example b/docker/.env.example index 49ce48a20d..c506a9d92e 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -375,7 +375,7 @@ SUPABASE_URL=your-server-url # ------------------------------ # The type of vector store to use. -# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `vikingdb`. +# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`. VECTOR_STORE=weaviate # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. 
@@ -414,6 +414,14 @@ MYSCALE_PASSWORD=
 MYSCALE_DATABASE=dify
 MYSCALE_FTS_PARAMS=
 
+# Couchbase configurations, only available when VECTOR_STORE is `couchbase`
+# The connection string must use the hostname defined in the docker-compose file (couchbase-server in this case)
+COUCHBASE_CONNECTION_STRING=couchbase://couchbase-server
+COUCHBASE_USER=Administrator
+COUCHBASE_PASSWORD=password
+COUCHBASE_BUCKET_NAME=Embeddings
+COUCHBASE_SCOPE_NAME=_default
+
 # pgvector configurations, only available when VECTOR_STORE is `pgvector`
 PGVECTOR_HOST=pgvector
 PGVECTOR_PORT=5432
diff --git a/docker/couchbase-server/Dockerfile b/docker/couchbase-server/Dockerfile
new file mode 100644
index 0000000000..bd8af64150
--- /dev/null
+++ b/docker/couchbase-server/Dockerfile
@@ -0,0 +1,4 @@
+FROM couchbase/server:latest AS stage_base
+# FROM couchbase:latest AS stage_base
+COPY init-cbserver.sh /opt/couchbase/init/
+RUN chmod +x /opt/couchbase/init/init-cbserver.sh
\ No newline at end of file
diff --git a/docker/couchbase-server/init-cbserver.sh b/docker/couchbase-server/init-cbserver.sh
new file mode 100755
index 0000000000..e66bc18530
--- /dev/null
+++ b/docker/couchbase-server/init-cbserver.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Starts Couchbase Server; docker compose allows only one startup command, so we launch it the way the standard Couchbase Dockerfile would:
+# https://github.com/couchbase/docker/blob/master/enterprise/couchbase-server/7.2.0/Dockerfile#L88
+
+/entrypoint.sh couchbase-server &
+
+# track if setup is complete so we don't try to setup again
+FILE=/opt/couchbase/init/setupComplete.txt
+
+if ! [ -f "$FILE" ]; then
+  # automatically create the cluster based on environment variables
+  # https://docs.couchbase.com/server/current/cli/cbcli/couchbase-cli-cluster-init.html
+
+  echo "$COUCHBASE_ADMINISTRATOR_USERNAME : $COUCHBASE_ADMINISTRATOR_PASSWORD"
+
+  sleep 20s
+  /opt/couchbase/bin/couchbase-cli cluster-init -c 127.0.0.1 \
+  --cluster-username "$COUCHBASE_ADMINISTRATOR_USERNAME" \
+  --cluster-password "$COUCHBASE_ADMINISTRATOR_PASSWORD" \
+  --services data,index,query,fts \
+  --cluster-ramsize "$COUCHBASE_RAM_SIZE" \
+  --cluster-index-ramsize "$COUCHBASE_INDEX_RAM_SIZE" \
+  --cluster-eventing-ramsize "$COUCHBASE_EVENTING_RAM_SIZE" \
+  --cluster-fts-ramsize "$COUCHBASE_FTS_RAM_SIZE" \
+  --index-storage-setting default
+
+  sleep 2s
+
+  # auto create the bucket based on environment variables
+  # https://docs.couchbase.com/server/current/cli/cbcli/couchbase-cli-bucket-create.html
+
+  /opt/couchbase/bin/couchbase-cli bucket-create -c localhost:8091 \
+  --username "$COUCHBASE_ADMINISTRATOR_USERNAME" \
+  --password "$COUCHBASE_ADMINISTRATOR_PASSWORD" \
+  --bucket "$COUCHBASE_BUCKET" \
+  --bucket-ramsize "$COUCHBASE_BUCKET_RAMSIZE" \
+  --bucket-type couchbase
+
+  # create file so we know that the cluster is setup and don't run the setup again
+  touch "$FILE"
+fi
+  # docker compose will stop the container from running unless we do this
+  # known issue and workaround
+  tail -f /dev/null
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index bf8a8b07e6..2a756ead9c 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -110,6 +110,11 @@ x-shared-env: &shared-api-worker-env
   QDRANT_CLIENT_TIMEOUT: ${QDRANT_CLIENT_TIMEOUT:-20}
   QDRANT_GRPC_ENABLED: ${QDRANT_GRPC_ENABLED:-false}
   QDRANT_GRPC_PORT: ${QDRANT_GRPC_PORT:-6334}
+  COUCHBASE_CONNECTION_STRING: ${COUCHBASE_CONNECTION_STRING:-couchbase-server}
+  COUCHBASE_USER: ${COUCHBASE_USER:-Administrator}
+  COUCHBASE_PASSWORD: ${COUCHBASE_PASSWORD:-password}
+  COUCHBASE_BUCKET_NAME: ${COUCHBASE_BUCKET_NAME:-Embeddings}
+  COUCHBASE_SCOPE_NAME: ${COUCHBASE_SCOPE_NAME:-_default}
   MILVUS_URI: ${MILVUS_URI:-http://127.0.0.1:19530}
   MILVUS_TOKEN: ${MILVUS_TOKEN:-}
   MILVUS_USER: ${MILVUS_USER:-root}
@@ -475,6 +480,39 @@ services:
     environment:
       QDRANT_API_KEY: ${QDRANT_API_KEY:-difyai123456}
 
+  # The Couchbase vector store.
+  couchbase-server:
+    build: ./couchbase-server
+    profiles:
+      - couchbase
+    restart: always
+    environment:
+      - CLUSTER_NAME=dify_search
+      - COUCHBASE_ADMINISTRATOR_USERNAME=${COUCHBASE_USER:-Administrator}
+      - COUCHBASE_ADMINISTRATOR_PASSWORD=${COUCHBASE_PASSWORD:-password}
+      - COUCHBASE_BUCKET=${COUCHBASE_BUCKET_NAME:-Embeddings}
+      - COUCHBASE_BUCKET_RAMSIZE=512
+      - COUCHBASE_RAM_SIZE=2048
+      - COUCHBASE_EVENTING_RAM_SIZE=512
+      - COUCHBASE_INDEX_RAM_SIZE=512
+      - COUCHBASE_FTS_RAM_SIZE=1024
+    hostname: couchbase-server
+    container_name: couchbase-server
+    working_dir: /opt/couchbase
+    stdin_open: true
+    tty: true
+    entrypoint: [""]
+    command: sh -c "/opt/couchbase/init/init-cbserver.sh"
+    volumes:
+      - ./volumes/couchbase/data:/opt/couchbase/var/lib/couchbase/data
+    healthcheck:
+      # ensure bucket was created before proceeding
+      test: [ "CMD-SHELL", "curl -s -f -u $${COUCHBASE_ADMINISTRATOR_USERNAME}:$${COUCHBASE_ADMINISTRATOR_PASSWORD} http://localhost:8091/pools/default/buckets | grep -q '\\[{' || exit 1" ]
+      interval: 10s
+      retries: 10
+      start_period: 30s
+      timeout: 10s
+
   # The pgvector vector database.
   pgvector:
     image: pgvector/pgvector:pg16
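
Usage note (illustrative, not part of the patch): the sketch below mirrors the integration test above and shows how the new `CouchbaseVector` is expected to be driven once the `couchbase` compose profile is up. The collection-name pattern, the 128-dimension embedding, and the `doc_id`/`document_id` metadata keys are assumptions borrowed from the test helpers (`AbstractVectorTest`), not requirements fixed by this patch.

```python
# Illustrative sketch only; assumes a Couchbase server provisioned by the bundled
# compose profile (bucket "Embeddings", user Administrator / password "password"),
# as in api/tests/integration_tests/vdb/couchbase/test_couchbase.py.
import uuid

from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseConfig, CouchbaseVector
from core.rag.models.document import Document

vector = CouchbaseVector(
    collection_name="Vector_index_" + uuid.uuid4().hex,  # hypothetical name; any valid collection name works
    config=CouchbaseConfig(
        connection_string="couchbase://127.0.0.1",
        user="Administrator",
        password="password",
        bucket_name="Embeddings",
        scope_name="_default",
    ),
)

embedding = [0.1] * 128  # any fixed dimension; the FTS index is created to match it
doc = Document(
    page_content="example chunk text",
    metadata={"doc_id": str(uuid.uuid4()), "document_id": str(uuid.uuid4())},
)

# create() provisions the collection plus its "<collection>_search" FTS index,
# then upserts the document together with its embedding.
vector.create(texts=[doc], embeddings=[embedding])

hits = vector.search_by_vector(query_vector=embedding, top_k=1)  # vector similarity
print([(h.page_content, h.metadata.get("score")) for h in hits])

hits = vector.search_by_full_text("example")  # full-text query over the same index
vector.delete()  # drops the collection again
```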