import logging
import multiprocessing
from multiprocessing.connection import Connection
import multiprocessing.context
import time
from typing import Generator, Callable, List, Tuple, cast
from uuid import UUID

from hypothesis import given
import hypothesis.strategies as st
import pytest

import chromadb
from chromadb.api import ClientAPI, ServerAPI
from chromadb.config import Settings, System
from chromadb.segment import VectorReader
from chromadb.segment.impl.manager.local import LocalSegmentManager
import chromadb.test.property.strategies as strategies
import chromadb.test.property.invariants as invariants
from chromadb.test.property.strategies import hashing_embedding_function
from chromadb.test.property.test_embeddings import (
    EmbeddingStateMachineStates,
    trace,
    EmbeddingStateMachineBase,
)
from hypothesis.stateful import (
    run_state_machine_as_test,
    rule,
    precondition,
    initialize,
    MultipleResults,
)
import os
import shutil
import tempfile
from chromadb.api.client import Client as ClientCreator
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
import numpy as np

CreatePersistAPI = Callable[[], ServerAPI]

configurations = [
    Settings(
        chroma_api_impl="chromadb.api.segment.SegmentAPI",
        chroma_sysdb_impl="chromadb.db.impl.sqlite.SqliteDB",
        chroma_producer_impl="chromadb.db.impl.sqlite.SqliteDB",
        chroma_consumer_impl="chromadb.db.impl.sqlite.SqliteDB",
        chroma_segment_manager_impl="chromadb.segment.impl.manager.local.LocalSegmentManager",
        allow_reset=True,
        is_persistent=True,
        persist_directory=tempfile.mkdtemp(),
    ),
]


@pytest.fixture(scope="module", params=configurations)
def settings(request: pytest.FixtureRequest) -> Generator[Settings, None, None]:
    configuration = request.param
    save_path = configuration.persist_directory
    # Create the persist directory if it doesn't exist
    if not os.path.exists(save_path):
        os.makedirs(save_path, exist_ok=True)
    yield configuration
    # Remove the persist directory if it exists
    if os.path.exists(save_path):
        shutil.rmtree(save_path, ignore_errors=True)


collection_st = st.shared(
    strategies.collections(
        with_hnsw_params=True,
        with_persistent_hnsw_params=st.just(True),
        # Makes it more likely to find persist-related bugs (by default these are set to 2000).
        # Lower values make it more likely that a test will trigger a persist to disk.
        max_hnsw_batch_size=10,
        max_hnsw_sync_threshold=10,
    ),
    key="coll",
)


@st.composite
def collection_and_recordset_strategy(
    draw: st.DrawFn,
) -> Tuple[strategies.Collection, strategies.RecordSet]:
    collection = draw(
        strategies.collections(
            with_hnsw_params=True,
            with_persistent_hnsw_params=st.just(True),
            # Makes it more likely to find persist-related bugs (by default these are set to 2000).
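            # Lower values make it more likely that a test will trigger a persist to disk.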
            max_hnsw_batch_size=10,
            max_hnsw_sync_threshold=10,
        )
    )
    recordset = draw(strategies.recordsets(st.just(collection)))
    return collection, recordset


@given(
    collection_and_recordset_strategies=st.lists(
        collection_and_recordset_strategy(),
        min_size=1,
        unique_by=lambda x: x[0].name,
    )
)
def test_persist(
    settings: Settings,
    collection_and_recordset_strategies: List[
        Tuple[strategies.Collection, strategies.RecordSet]
    ],
) -> None:
    system_1 = System(settings)
    system_1.start()
    client_1 = ClientCreator.from_system(system_1)
    client_1.reset()

    for (
        collection_strategy,
        recordset_strategy,
    ) in collection_and_recordset_strategies:
        coll = client_1.create_collection(
            name=collection_strategy.name,
            metadata=collection_strategy.metadata,  # type: ignore[arg-type]
            embedding_function=collection_strategy.embedding_function,
        )
        coll.add(**recordset_strategy)  # type: ignore[arg-type]

        invariants.count(coll, recordset_strategy)
        invariants.metadatas_match(coll, recordset_strategy)
        invariants.documents_match(coll, recordset_strategy)
        invariants.ids_match(coll, recordset_strategy)
        invariants.ann_accuracy(
            coll,
            recordset_strategy,
            embedding_function=collection_strategy.embedding_function,
        )

    system_1.stop()
    del client_1
    del system_1

    system_2 = System(settings)
    system_2.start()
    client_2 = ClientCreator.from_system(system_2)

    for (
        collection_strategy,
        recordset_strategy,
    ) in collection_and_recordset_strategies:
        coll = client_2.get_collection(
            name=collection_strategy.name,
            embedding_function=collection_strategy.embedding_function,
        )
        invariants.count(coll, recordset_strategy)
        invariants.metadatas_match(coll, recordset_strategy)
        invariants.documents_match(coll, recordset_strategy)
        invariants.ids_match(coll, recordset_strategy)
        invariants.ann_accuracy(
            coll,
            recordset_strategy,
            embedding_function=collection_strategy.embedding_function,
        )

    system_2.stop()
    del client_2
    del system_2


def test_sync_threshold(settings: Settings) -> None:
    system = System(settings)
    system.start()
    client = ClientCreator.from_system(system)

    collection = client.create_collection(
        name="test", metadata={"hnsw:batch_size": 3, "hnsw:sync_threshold": 3}
    )

    manager = system.instance(LocalSegmentManager)
    segment = manager.get_segment(collection.id, VectorReader)

    def get_index_last_modified_at() -> float:
        # Time resolution on Windows can be up to 10ms
        time.sleep(0.1)
        try:
            return os.path.getmtime(segment._get_metadata_file())  # type: ignore[attr-defined]
        except FileNotFoundError:
            return -1

    last_modified_at = get_index_last_modified_at()

    collection.add(ids=["1", "2"], embeddings=[[1.0], [2.0]])  # type: ignore[arg-type]

    # Should not have persisted yet
    assert get_index_last_modified_at() == last_modified_at
    last_modified_at = get_index_last_modified_at()

    # Now there are 3 additions, and the sync threshold is 3...
collection.add(ids=["3"], embeddings=[[3.0]]) # type: ignore[arg-type] # ...so it should have persisted assert get_index_last_modified_at() > last_modified_at last_modified_at = get_index_last_modified_at() # The same thing should happen with upserts collection.upsert(ids=["1", "2", "3"], embeddings=[[1.0], [2.0], [3.0]]) # type: ignore[arg-type] # Should have persisted assert get_index_last_modified_at() > last_modified_at last_modified_at = get_index_last_modified_at() # Mixed usage should also trigger persistence collection.add(ids=["4"], embeddings=[[4.0]]) # type: ignore[arg-type] collection.upsert(ids=["1", "2"], embeddings=[[1.0], [2.0]]) # type: ignore[arg-type] # Should have persisted assert get_index_last_modified_at() > last_modified_at last_modified_at = get_index_last_modified_at() # Invalid updates should also trigger persistence collection.add(ids=["5"], embeddings=[[5.0]]) # type: ignore[arg-type] collection.add(ids=["1", "2"], embeddings=[[1.0], [2.0]]) # type: ignore[arg-type] # Should have persisted assert get_index_last_modified_at() > last_modified_at last_modified_at = get_index_last_modified_at() def load_and_check( settings: Settings, collection_name: str, record_set: strategies.RecordSet, conn: Connection, ) -> None: try: system = System(settings) system.start() client = ClientCreator.from_system(system) coll = client.get_collection( name=collection_name, embedding_function=strategies.not_implemented_embedding_function(), # type: ignore[arg-type] ) invariants.count(coll, record_set) invariants.metadatas_match(coll, record_set) invariants.documents_match(coll, record_set) invariants.ids_match(coll, record_set) invariants.ann_accuracy(coll, record_set) system.stop() except Exception as e: conn.send(e) raise e def get_multiprocessing_context(): # type: ignore[no-untyped-def] try: # Run the invariants in a new process to bypass any shared state/caching (which would defeat the purpose of the test) # (forkserver is used because it's much faster than spawn—it will spawn a new, minimal singleton process and then fork that singleton) ctx = multiprocessing.get_context("forkserver") # This is like running `import chromadb` in the single process that is forked rather than importing it in each forked process. # Gives a ~3x speedup since importing chromadb is fairly expensive. 
        ctx.set_forkserver_preload(["chromadb"])
        return ctx
    except Exception:
        # forkserver/fork is not available on Windows
        return multiprocessing.get_context("spawn")


class PersistEmbeddingsStateMachineStates(EmbeddingStateMachineStates):
    persist = "persist"


MIN_STATE_CHANGES_BEFORE_PERSIST = 5


class PersistEmbeddingsStateMachine(EmbeddingStateMachineBase):
    def __init__(self, client: ClientAPI, settings: Settings):
        self.client = client
        self.settings = settings
        self.min_state_changes_left_before_persisting = MIN_STATE_CHANGES_BEFORE_PERSIST
        self.client.reset()
        super().__init__(self.client)

    @initialize(collection=collection_st)  # type: ignore
    def initialize(self, collection: strategies.Collection) -> None:
        self.client.reset()
        self.collection = self.client.create_collection(
            name=collection.name,
            metadata=collection.metadata,  # type: ignore[arg-type]
            embedding_function=collection.embedding_function,
        )
        self.embedding_function = collection.embedding_function
        trace("init")
        self.on_state_change(EmbeddingStateMachineStates.initialize)
        self.record_set_state = strategies.StateMachineRecordSet(
            ids=[], metadatas=[], documents=[], embeddings=[]
        )

    @precondition(
        lambda self: len(self.record_set_state["ids"]) >= 1
        and self.min_state_changes_left_before_persisting <= 0
    )
    @rule()
    def persist(self) -> None:
        self.on_state_change(PersistEmbeddingsStateMachineStates.persist)
        collection_name = self.collection.name
        # Check the invariants in a separate process so they run against the
        # on-disk state rather than anything cached in this process
        conn1, conn2 = multiprocessing.Pipe()
        ctx = get_multiprocessing_context()  # type: ignore[no-untyped-call]
        p = ctx.Process(
            target=load_and_check,
            args=(self.settings, collection_name, self.record_set_state, conn2),
        )
        p.start()
        p.join()

        if conn1.poll():
            e = conn1.recv()
            raise e

        p.close()

    def on_state_change(self, new_state: str) -> None:
        super().on_state_change(new_state)
        if new_state == PersistEmbeddingsStateMachineStates.persist:
            self.min_state_changes_left_before_persisting = (
                MIN_STATE_CHANGES_BEFORE_PERSIST
            )
        else:
            self.min_state_changes_left_before_persisting -= 1

    def teardown(self) -> None:
        self.client.reset()


def test_persist_embeddings_state(
    caplog: pytest.LogCaptureFixture, settings: Settings
) -> None:
    caplog.set_level(logging.ERROR)
    client = chromadb.Client(settings)
    run_state_machine_as_test(
        lambda: PersistEmbeddingsStateMachine(settings=settings, client=client),
    )  # type: ignore


def test_delete_less_than_k(
    caplog: pytest.LogCaptureFixture, settings: Settings
) -> None:
    client = chromadb.Client(settings)
    state = PersistEmbeddingsStateMachine(settings=settings, client=client)
    state.initialize(
        collection=strategies.Collection(
            name="A00",
            metadata={
                "hnsw:construction_ef": 128,
                "hnsw:search_ef": 128,
                "hnsw:M": 128,
                "hnsw:sync_threshold": 3,
                "hnsw:batch_size": 3,
            },
            embedding_function=None,
            id=UUID("2d3eddc7-2314-45f4-a951-47a9a8e099d2"),
            dimension=2,
            dtype=np.float16,
            known_metadata_keys={},
            known_document_keywords=[],
            has_documents=False,
            has_embeddings=True,
        )
    )
    state.ann_accuracy()
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    (embedding_ids_0,) = state.add_embeddings(  # type: ignore
        record_set={
            "ids": ["0"],
            "embeddings": [[0.09765625, 0.430419921875]],
            "metadatas": [None],
            "documents": None,
        }
    )
    state.ann_accuracy()  # recall: 1.0, missing 0 out of 1, accuracy threshold 1e-06
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    embedding_ids_1, embedding_ids_2 = state.add_embeddings(  # type: ignore
        record_set={
            "ids": ["1", "2"],
            "embeddings": [
                [0.20556640625, 0.08978271484375],
                [-0.1527099609375, 0.291748046875],
            ],
            "metadatas": [None, None],
"documents": None}) # type: ignore state.ann_accuracy() # recall: 1.0, missing 0 out of 3, accuracy threshold 1e-06 state.count() state.fields_match() state.log_size_below_max() state.no_duplicates() state.delete_by_ids(ids=[embedding_ids_2]) state.ann_accuracy() state.teardown() # Ideally this scenario would be exercised by Hypothesis, but most runs don't seem to trigger this particular state. def test_delete_add_after_persist(settings: Settings) -> None: client = chromadb.Client(settings) state = PersistEmbeddingsStateMachine(settings=settings, client=client) state.initialize( collection=strategies.Collection( name="A00", metadata={ "hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128, # Important: both batch_size and sync_threshold are 3 "hnsw:batch_size": 3, "hnsw:sync_threshold": 3, }, embedding_function=DefaultEmbeddingFunction(), # type: ignore[arg-type] id=UUID("0851f751-2f11-4424-ab23-4ae97074887a"), dimension=2, dtype=None, known_metadata_keys={}, known_document_keywords=[], has_documents=False, has_embeddings=True, ) ) state.add_embeddings( record_set={ # Add 3 records to hit the batch_size and sync_threshold "ids": ["0", "1", "2"], "embeddings": [[0, 0], [0, 0], [0, 0]], "metadatas": [None, None, None], "documents": None, } ) # Delete and then re-add record state.delete_by_ids(ids=["0"]) state.add_embeddings( record_set={ "ids": ["0"], "embeddings": [[1, 1]], "metadatas": [None], "documents": None, } ) # At this point, the changes above are not fully persisted state.fields_match() def test_batch_size_less_than_sync_with_duplicate_adds_results_in_skipped_seq_ids( caplog: pytest.LogCaptureFixture, settings: Settings ) -> None: # NOTE(hammadb) this test was autogenerate by hypothesis and added here to ensure that the test is run # in the future. It tests a case where the max seq id was incorrect in response to the same # id being added multiple times in a bathc. 
    client = chromadb.Client(settings)
    state = PersistEmbeddingsStateMachine(settings=settings, client=client)
    state.initialize(
        collection=strategies.Collection(
            name="JqzMs4pPm14c\n",
            metadata={
                "hnsw:construction_ef": 128,
                "hnsw:search_ef": 128,
                "hnsw:M": 128,
                "hnsw:sync_threshold": 9,
                "hnsw:batch_size": 7,
            },
            embedding_function=hashing_embedding_function(dim=92, dtype=np.float64),
            id=UUID("45c5c816-0a90-4293-8d01-4325ff860040"),
            dimension=92,
            dtype=np.float64,
            known_metadata_keys={},
            known_document_keywords=[],
            has_documents=False,
            has_embeddings=True,
        )
    )
    state.ann_accuracy()
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    (
        embedding_ids_0,
        embedding_ids_1,
        embedding_ids_2,
        embedding_ids_3,
        embedding_ids_4,
        embedding_ids_5,
        embedding_ids_6,
    ) = cast(
        MultipleResults[str],
        state.add_embeddings(
            record_set={
                "ids": ["N", "e8r6", "4", "Yao", "qFjA2c", "jHCv", "2"],
                "embeddings": [
                    [0.0, 0.0, 0.0],
                    [1.0, 1.0, 1.0],
                    [2.0, 2.0, 2.0],
                    [3.0, 3.0, 3.0],
                    [4.0, 4.0, 4.0],
                    [5.0, 5.0, 5.0],
                    [6.0, 6.0, 6.0],
                ],
                "metadatas": None,
                "documents": None,
            }
        ),
    )
    state.ann_accuracy()  # recall: 1.0, missing 0 out of 7, accuracy threshold 1e-06
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    state.add_embeddings(
        record_set={
            "ids": ["MVu393QTc"],
            "embeddings": [[7.0, 7.0, 7.0]],
            "metadatas": None,
            "documents": None,
        }
    )
    state.ann_accuracy()  # recall: 1.0, missing 0 out of 8, accuracy threshold 1e-06
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    (
        _,
        _,
        _,
        _,
        embedding_ids_12,
        _,
        _,
        _,
        _,
        embedding_ids_17,
        embedding_ids_18,
        _,
        _,
        _,
        embedding_ids_22,
        _,
        _,
    ) = cast(
        MultipleResults[str],
        state.add_embeddings(
            record_set={
                "ids": [
                    "CyF0Mk-",
                    "q_Fwu",
                    "2D2sQSFogDgPLkcfT",
                    "SrwuQHQ6w4f51qWr2enLPQw8uKYs1",
                    "G",
                    "wdzt",
                    "5W",
                    "8tpsn",
                    "fJbV7z",
                    "5",
                    "V",
                    "1iFkoJX",
                    "Zw4u",
                    "Fc",
                    "7",
                    "vEEwrP",
                    "Yf",
                ],
                "embeddings": [
                    [8.0, 8.0, 8.0],
                    [9.0, 9.0, 9.0],
                    [10.0, 10.0, 10.0],
                    [11.0, 11.0, 11.0],
                    [12.0, 12.0, 12.0],
                    [13.0, 13.0, 13.0],
                    [14.0, 14.0, 14.0],
                    [15.0, 15.0, 15.0],
                    [16.0, 16.0, 16.0],
                    [17.0, 17.0, 17.0],
                    [18.0, 18.0, 18.0],
                    [19.0, 19.0, 19.0],
                    [20.0, 20.0, 20.0],
                    [21.0, 21.0, 21.0],
                    [22.0, 22.0, 22.0],
                    [23.0, 23.0, 23.0],
                    [24.0, 24.0, 24.0],
                ],
                "metadatas": None,
                "documents": None,
            }
        ),
    )
    state.ann_accuracy()
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    state.add_embeddings(
        record_set={
            "ids": ["0", "df_RWhR0HelOcv"],
            "embeddings": [[25.0, 25.0, 25.0], [26.0, 26.0, 26.0]],
            "metadatas": [None, None],
            "documents": None,
        }
    )
    state.ann_accuracy()
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    state.add_embeddings(
        record_set={
            "ids": ["3R", "9_", "44u", "3B", "MZCXZDS", "Uelx"],
            "embeddings": [
                [27.0, 27.0, 27.0],
                [28.0, 28.0, 28.0],
                [29.0, 29.0, 29.0],
                [30.0, 30.0, 30.0],
                [31.0, 31.0, 31.0],
                [32.0, 32.0, 32.0],
            ],
            "metadatas": None,
            "documents": None,
        }
    )
    state.ann_accuracy()
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    state.persist()
    state.ann_accuracy()
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    state.add_embeddings(
        record_set={
            "ids": "YlVm",
            "embeddings": [[33.0, 33.0, 33.0]],
            "metadatas": None,
            "documents": None,
        }
    )
    state.ann_accuracy()  # recall: 1.0, missing 0 out of 34, accuracy threshold 1e-06
    state.count()
    state.fields_match()
    state.log_size_below_max()
    state.no_duplicates()
    state.add_embeddings(
        record_set={
["Rk1", "TPL"], "embeddings": [[34.0, 34.0, 34.0], [35.0, 35.0, 35.0]], "metadatas": [None, None], "documents": None, } ) state.ann_accuracy() # recall: 1.0, missing 0 out of 36, accuracy threshold 1e-06 state.count() state.fields_match() state.log_size_below_max() state.no_duplicates() state.add_embeddings( record_set={ "ids": [ "CyF0Mk-", "q_Fwu", "2D2sQSFogDgPLkcfT", "SrwuQHQ6w4f51qWr2enLPQw8uKYs1", embedding_ids_12, "wdzt", "5W", "8tpsn", "fJbV7z", embedding_ids_17, embedding_ids_18, "1iFkoJX", "Zw4u", "Fc", embedding_ids_22, "vEEwrP", "Yf", ], "embeddings": [ [8.0, 8.0, 8.0], [9.0, 9.0, 9.0], [10.0, 10.0, 10.0], [11.0, 11.0, 11.0], [12.0, 12.0, 12.0], [13.0, 13.0, 13.0], [14.0, 14.0, 14.0], [15.0, 15.0, 15.0], [16.0, 16.0, 16.0], [17.0, 17.0, 17.0], [18.0, 18.0, 18.0], [19.0, 19.0, 19.0], [20.0, 20.0, 20.0], [21.0, 21.0, 21.0], [22.0, 22.0, 22.0], [23.0, 23.0, 23.0], [24.0, 24.0, 24.0], ], "metadatas": None, "documents": None, } ) state.ann_accuracy() state.count() state.fields_match() state.log_size_below_max() state.teardown()