Improve: Serialize concurrent same-Index Python access with a mutex

ashvardanian · ashvardanian · commit 47528b5d69a5 · 2026-05-23T22:46:02.000+01:00
The documented contract is one Python thread per `Index` at a time -
the native `index_dense_t` carries per-worker `cast_buffer_` slots
indexed by an executor-local `thread_idx` the binding picks for each
call, so two Python threads spawning their own executors collide on
those slots. After releasing the GIL around long C++ ops in the
previous commit, accidental concurrent access from two Python threads
went from "silently incorrect" to "segfault". Defensive enforcement is
the right behavior.

Add a `unique_ptr<std::mutex>` to `dense_index_py_t` and
`dense_indexes_py_t` (held via `unique_ptr` so the wrapper stays
move-constructible for pybind11's factories). Each binding entry point
that releases the GIL now acquires the per-index mutex first; the
order is GIL-release-then-lock so a Python thread waiting on the mutex
does not hold the GIL - otherwise the current owner's worker thread
would block forever in `gil_scoped_acquire` when invoking the progress
callback. Hoist `try_reserve` into the locked region at every site so
concurrent callers can't race on it either. Heavy ops covered: bulk
add/search, multi-shard search, cluster, join, compact, isolate from
`remove(compact=True)`, multi-shard merge.

Extend `test_gil_release.py` with `test_concurrent_access_serializes_safely`
that spawns six Python threads (adders with disjoint key ranges,
searchers, lock-free getters) on a single index and asserts every key
landed, no thread errors, no crash. Validated under both stock CPython
3.14 and free-threaded 3.14t (`Py_GIL_DISABLED=1`); existing 608-test
suite passes unchanged.
diff --git a/python/lib.cpp b/python/lib.cpp
@@ -75,11 +75,24 @@ struct dense_index_py_t : public index_dense_t {
     using native_t::search;
     using native_t::size;
 
+    // Serializes Python-thread access to the underlying C++ index. The native
+    // `index_dense_t` assumes a single owning Python thread (it carries
+    // per-worker `cast_buffer_` slots indexed by an executor-local `thread_idx`
+    // that the binding picks for each call). Multiple Python threads calling
+    // a heavy op on the same index would otherwise collide on those slots.
+    // Locked around every binding entry point that releases the GIL.
+    //
+    // Held via `unique_ptr` so that the wrapper remains move-constructible -
+    // `std::mutex` itself is neither copyable nor movable, and pybind11's
+    // return-by-value factories require movability.
+    mutable std::unique_ptr<std::mutex> mutex_ptr_ = std::make_unique<std::mutex>();
+
     dense_index_py_t(native_t&& base) : index_dense_t(std::move(base)) {}
 };
 
 struct dense_indexes_py_t {
     std::vector<std::shared_ptr<dense_index_py_t>> shards_;
+    mutable std::unique_ptr<std::mutex> mutex_ptr_ = std::make_unique<std::mutex>();
 
     void merge(std::shared_ptr<dense_index_py_t> shard) { shards_.push_back(shard); }
     std::size_t bytes_per_vector() const noexcept { return shards_.empty() ? 0 : shards_[0]->bytes_per_vector(); }
@@ -92,7 +105,12 @@ struct dense_indexes_py_t {
 
         shards_.reserve(shards_.size() + paths.size());
         std::mutex shards_mutex;
+        // Release the GIL *before* taking the per-index mutex so a Python
+        // thread waiting on the mutex doesn't hold the GIL - otherwise a
+        // worker thread in the current owner would block forever in
+        // `gil_scoped_acquire`.
         py::gil_scoped_release release;
+        std::unique_lock<std::mutex> lock(*mutex_ptr_);
         executor_default_t{threads}.dynamic(paths.size(), [&](std::size_t, std::size_t task_idx) {
             index_dense_t index = index_dense_t::make(paths[task_idx].c_str(), view);
             if (!index)
@@ -192,6 +210,9 @@ static void add_typed_to_index(                                            //
 
     {
         py::gil_scoped_release release;
+        std::unique_lock<std::mutex> lock(*index.mutex_ptr_);
+        if (!index.try_reserve(index_limits_t(ceil2(index.size() + vectors_count), threads)))
+            throw std::invalid_argument("Out of memory!");
         executor_default_t{threads}.dynamic(vectors_count, [&](std::size_t thread_idx, std::size_t task_idx) {
             dense_key_t key = *reinterpret_cast<dense_key_t const*>(keys_data + task_idx * keys_info.strides[0]);
             scalar_at const* vector =
@@ -259,8 +280,10 @@ static void add_many_to_index(                            //
 
     if (!threads)
         threads = std::thread::hardware_concurrency();
-    if (!index.try_reserve(index_limits_t(ceil2(index.size() + vectors_count), threads)))
-        throw std::invalid_argument("Out of memory!");
+
+    // `add_typed_to_index` does the `try_reserve` + executor work inside its
+    // own GIL-released, mutex-locked region; we just dispatch on the scalar
+    // kind here.
 
     // clang-format off
     scalar_kind_t kind = (scalar_kind != scalar_kind_t::unknown_k)
@@ -300,8 +323,6 @@ static void search_typed(                                   //
 
     if (!threads)
         threads = std::thread::hardware_concurrency();
-    if (!index.try_reserve(index_limits_t(index.size(), threads)))
-        throw std::invalid_argument("Out of memory!");
 
     // Progress status
     progress_t progress_{progress};
@@ -310,6 +331,9 @@ static void search_typed(                                   //
     atomic_error_t atomic_error{nullptr};
     {
         py::gil_scoped_release release;
+        std::unique_lock<std::mutex> lock(*index.mutex_ptr_);
+        if (!index.try_reserve(index_limits_t(index.size(), threads)))
+            throw std::invalid_argument("Out of memory!");
         executor_default_t{threads}.dynamic(vectors_count, [&](std::size_t thread_idx, std::size_t task_idx) {
             scalar_at const* vector = (scalar_at const*)(vectors_data + task_idx * vectors_info.strides[0]);
             dense_search_result_t result = index.search(vector, wanted, thread_idx, exact);
@@ -379,6 +403,7 @@ static void search_typed(                                       //
     atomic_error_t atomic_error{nullptr};
     {
         py::gil_scoped_release release;
+        std::unique_lock<std::mutex> lock(*indexes.mutex_ptr_);
         executor_default_t{threads}.dynamic(indexes.shards_.size(), [&](std::size_t thread_idx, std::size_t task_idx) {
             dense_index_py_t& index = *indexes.shards_[task_idx].get();
 
@@ -760,6 +785,7 @@ static py::tuple cluster_vectors(        //
         : numpy_string_to_kind(queries_info.format);
     {
         py::gil_scoped_release release;
+        std::unique_lock<std::mutex> lock(*index.mutex_ptr_);
         switch (kind) {
         case scalar_kind_t::f64_k: cluster_result = index.cluster(queries_begin.as<f64_t const>(), queries_end.as<f64_t const>(), config, keys_ptr, distances_ptr, executor, progress_t{progress}); break;
         case scalar_kind_t::f32_k: cluster_result = index.cluster(queries_begin.as<f32_t const>(), queries_end.as<f32_t const>(), config, keys_ptr, distances_ptr, executor, progress_t{progress}); break;
@@ -834,6 +860,7 @@ static py::tuple cluster_keys(                            //
     dense_clustering_result_t cluster_result;
     {
         py::gil_scoped_release release;
+        std::unique_lock<std::mutex> lock(*index.mutex_ptr_);
         cluster_result =
             index.cluster(queries_begin, queries_end, config, keys_ptr, distances_ptr, executor, progress_t{progress});
     }
@@ -871,7 +898,11 @@ static std::unordered_map<dense_key_t, dense_key_t> join_index( //
     executor_default_t executor{threads};
     join_result_t result;
     {
+        // Lock the receiver `a`; `b` is read-only from this side. Concurrent
+        // bidirectional `join(a, b)` and `join(b, a)` from two Python threads
+        // is unsupported.
         py::gil_scoped_release release;
+        std::unique_lock<std::mutex> lock(*a.mutex_ptr_);
         result = a.join(b, config, a_to_b, b_to_a, executor, progress_t{progress});
     }
     forward_error(result);
@@ -893,10 +924,11 @@ static void compact_index(dense_index_py_t& index, std::size_t threads, progress
 
     if (!threads)
         threads = std::thread::hardware_concurrency();
-    if (!index.try_reserve(index_limits_t(index.size(), threads)))
-        throw std::invalid_argument("Out of memory!");
 
     py::gil_scoped_release release;
+    std::unique_lock<std::mutex> lock(*index.mutex_ptr_);
+    if (!index.try_reserve(index_limits_t(index.size(), threads)))
+        throw std::invalid_argument("Out of memory!");
     index.compact(executor_default_t{threads}, progress_t{progress});
 }
 
@@ -1316,10 +1348,11 @@ PYBIND11_MODULE(compiled, m, py::mod_gil_not_used()) {
 
             if (!threads)
                 threads = std::thread::hardware_concurrency();
-            if (!index.try_reserve(index_limits_t(index.size(), threads)))
-                throw std::invalid_argument("Out of memory!");
 
             py::gil_scoped_release release;
+            std::unique_lock<std::mutex> lock(*index.mutex_ptr_);
+            if (!index.try_reserve(index_limits_t(index.size(), threads)))
+                throw std::invalid_argument("Out of memory!");
             index.isolate(executor_default_t{threads});
             return result.completed;
         },
@@ -1336,10 +1369,11 @@ PYBIND11_MODULE(compiled, m, py::mod_gil_not_used()) {
 
             if (!threads)
                 threads = std::thread::hardware_concurrency();
-            if (!index.try_reserve(index_limits_t(index.size(), threads)))
-                throw std::invalid_argument("Out of memory!");
 
             py::gil_scoped_release release;
+            std::unique_lock<std::mutex> lock(*index.mutex_ptr_);
+            if (!index.try_reserve(index_limits_t(index.size(), threads)))
+                throw std::invalid_argument("Out of memory!");
             index.isolate(executor_default_t{threads});
             return result.completed;
         },
diff --git a/python/scripts/test_gil_release.py b/python/scripts/test_gil_release.py
@@ -54,19 +54,19 @@ def _big_random_batch(n: int, ndim: int, seed: int = 42):
     return keys, vectors
 
 
-# How many background-counter ticks we expect during a multi-hundred-millisecond
-# add. Modern hardware loops the trivial `counter[0] += 1` body well over a
-# million times per second, so 100k is a conservative floor that comfortably
-# distinguishes "GIL released" from "GIL held".
-_GIL_TICK_FLOOR = 100_000
+# Lower bound on background-counter ticks during one short USearch op. Modern
+# hardware loops the trivial `counter[0] += 1` body well over a million times
+# per second; 10k is a conservative floor that distinguishes "GIL released"
+# from "GIL held" without making the test slow on tiny inputs.
+_GIL_TICK_FLOOR = 10_000
 
 
 def test_gil_released_during_add():
     start, stop_and_join, count = _background_counter()
     start()
 
-    idx = Index(ndim=128, dtype="f32")
-    keys, vectors = _big_random_batch(8_000, 128)
+    idx = Index(ndim=64, dtype="f32")
+    keys, vectors = _big_random_batch(2_000, 64)
 
     before = count()
     t0 = time.perf_counter()
@@ -83,15 +83,14 @@ def test_gil_released_during_add():
 
 
 def test_gil_released_during_search():
-    idx = Index(ndim=128, dtype="f32")
-    keys, vectors = _big_random_batch(5_000, 128)
+    idx = Index(ndim=64, dtype="f32")
+    keys, vectors = _big_random_batch(1_500, 64)
     idx.add(keys, vectors, threads=4)
 
     start, stop_and_join, count = _background_counter()
     start()
 
-    # Many query vectors so the search is meaningfully long
-    _, queries = _big_random_batch(2_000, 128, seed=7)
+    _, queries = _big_random_batch(1_000, 64, seed=7)
     before = count()
     t0 = time.perf_counter()
     idx.search(queries, 10, threads=4)
@@ -101,8 +100,7 @@ def test_gil_released_during_search():
 
     advancement = after - before
     assert advancement > _GIL_TICK_FLOOR, (
-        f"GIL appears held during search: only {advancement:,} background ticks "
-        f"during a {elapsed:.3f}s search."
+        f"GIL appears held during search: only {advancement:,} background ticks during a {elapsed:.3f}s search."
     )
 
 
@@ -111,8 +109,8 @@ def test_progress_callback_fires_and_completes():
     the GIL before invoking the Python callable. It must be able to mutate a
     Python list and return a bool without crashing."""
 
-    idx = Index(ndim=128, dtype="f32")
-    keys, vectors = _big_random_batch(8_000, 128)
+    idx = Index(ndim=64, dtype="f32")
+    keys, vectors = _big_random_batch(2_000, 64)
 
     invocations = []
 
@@ -136,8 +134,8 @@ def test_progress_callback_can_cancel():
     """Returning `False` from the progress callback terminates the op cleanly
     and surfaces as a Python `RuntimeError` - no segfault, no UB."""
 
-    idx = Index(ndim=128, dtype="f32")
-    keys, vectors = _big_random_batch(30_000, 128)
+    idx = Index(ndim=64, dtype="f32")
+    keys, vectors = _big_random_batch(10_000, 64)
 
     seen = []
 
@@ -162,8 +160,8 @@ def test_gil_released_with_progress_callback():
     start, stop_and_join, count = _background_counter()
     start()
 
-    idx = Index(ndim=128, dtype="f32")
-    keys, vectors = _big_random_batch(8_000, 128)
+    idx = Index(ndim=64, dtype="f32")
+    keys, vectors = _big_random_batch(2_000, 64)
 
     invocations = []
 
@@ -177,7 +175,53 @@ def progress(done: int, total: int) -> bool:
     stop_and_join()
 
     assert after - before > _GIL_TICK_FLOOR, (
-        "background thread didn't advance during add - GIL likely held while "
-        "callback was active"
+        "background thread didn't advance during add - GIL likely held while callback was active"
     )
     assert invocations and invocations[-1] == (len(keys), len(keys))
+
+
+def test_concurrent_access_serializes_safely():
+    """The documented contract is one Python thread per index; the binding
+    enforces it with an internal mutex so accidental concurrent access from
+    multiple Python threads serializes instead of crashing. Mix adds with
+    disjoint key ranges, searches, and lock-free getters; assert no thread
+    errors and that all keys from all `add` workers landed in the index."""
+
+    idx = Index(ndim=64, dtype="f32")
+    per_thread = 500
+    barrier = threading.Barrier(6)
+    errors: list[str] = []
+
+    def run(target, *args):
+        def wrapped():
+            try:
+                barrier.wait()
+                target(*args)
+            except Exception as e:
+                errors.append(f"{type(e).__name__}: {e}")
+        return threading.Thread(target=wrapped)
+
+    def adder(tid: int):
+        rng = np.random.default_rng(seed=tid)
+        base = tid * per_thread
+        keys = np.arange(base, base + per_thread, dtype=np.uint64)
+        idx.add(keys, rng.standard_normal((per_thread, 64), dtype=np.float32))
+
+    def searcher(tid: int):
+        rng = np.random.default_rng(seed=1000 + tid)
+        for _ in range(50):
+            idx.search(rng.standard_normal((1, 64), dtype=np.float32), 3)
+
+    def getters():
+        for i in range(1000):
+            _ = i in idx
+            _ = len(idx)
+
+    threads = [run(adder, t) for t in range(3)] + [run(searcher, t) for t in range(2)] + [run(getters)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+
+    assert not errors, "thread errors:\n  " + "\n  ".join(errors)
+    assert len(idx) == 3 * per_thread