@@ -75,11 +75,24 @@ struct dense_index_py_t : public index_dense_t {
7575 using native_t ::search;
7676 using native_t ::size;
7777
78+ // Serializes Python-thread access to the underlying C++ index. The native
79+ // `index_dense_t` assumes a single owning Python thread (it carries
80+ // per-worker `cast_buffer_` slots indexed by an executor-local `thread_idx`
81+ // that the binding picks for each call). Multiple Python threads calling
82+ // a heavy op on the same index would otherwise collide on those slots.
83+ // Locked around every binding entry point that releases the GIL.
84+ //
85+ // Held via `unique_ptr` so that the wrapper remains move-constructible -
86+ // `std::mutex` itself is neither copyable nor movable, and pybind11's
87+ // return-by-value factories require movability.
88+ mutable std::unique_ptr<std::mutex> mutex_ptr_ = std::make_unique<std::mutex>();
89+
7890 dense_index_py_t (native_t && base) : index_dense_t (std::move(base)) {}
7991};
8092
8193struct dense_indexes_py_t {
8294 std::vector<std::shared_ptr<dense_index_py_t >> shards_;
95+ mutable std::unique_ptr<std::mutex> mutex_ptr_ = std::make_unique<std::mutex>();
8396
8497 void merge (std::shared_ptr<dense_index_py_t > shard) { shards_.push_back (shard); }
8598 std::size_t bytes_per_vector () const noexcept { return shards_.empty () ? 0 : shards_[0 ]->bytes_per_vector (); }
@@ -92,7 +105,12 @@ struct dense_indexes_py_t {
92105
93106 shards_.reserve (shards_.size () + paths.size ());
94107 std::mutex shards_mutex;
108+ // Release the GIL *before* taking this collection's mutex, so that a
109+ // Python thread waiting on the mutex doesn't hold the GIL - otherwise a
110+ // worker thread of the current lock holder would block forever in
111+ // `gil_scoped_acquire`.
95112 py::gil_scoped_release release;
113+ std::unique_lock<std::mutex> lock (*mutex_ptr_);
96114 executor_default_t {threads}.dynamic (paths.size (), [&](std::size_t , std::size_t task_idx) {
97115 index_dense_t index = index_dense_t::make (paths[task_idx].c_str (), view);
98116 if (!index)
@@ -192,6 +210,9 @@ static void add_typed_to_index( //
192210
193211 {
194212 py::gil_scoped_release release;
213+ std::unique_lock<std::mutex> lock (*index.mutex_ptr_ );
214+ if (!index.try_reserve (index_limits_t (ceil2 (index.size () + vectors_count), threads)))
215+ throw std::invalid_argument (" Out of memory!" );
195216 executor_default_t {threads}.dynamic (vectors_count, [&](std::size_t thread_idx, std::size_t task_idx) {
196217 dense_key_t key = *reinterpret_cast <dense_key_t const *>(keys_data + task_idx * keys_info.strides [0 ]);
197218 scalar_at const * vector =
@@ -259,8 +280,10 @@ static void add_many_to_index( //
259280
260281 if (!threads)
261282 threads = std::thread::hardware_concurrency ();
262- if (!index.try_reserve (index_limits_t (ceil2 (index.size () + vectors_count), threads)))
263- throw std::invalid_argument (" Out of memory!" );
283+
284+ // `add_typed_to_index` does the `try_reserve` + executor work inside its
285+ // own GIL-released, mutex-locked region; we just dispatch on the scalar
286+ // kind here.
264287
265288 // clang-format off
266289 scalar_kind_t kind = (scalar_kind != scalar_kind_t ::unknown_k)
@@ -300,8 +323,6 @@ static void search_typed( //
300323
301324 if (!threads)
302325 threads = std::thread::hardware_concurrency ();
303- if (!index.try_reserve (index_limits_t (index.size (), threads)))
304- throw std::invalid_argument (" Out of memory!" );
305326
306327 // Progress status
307328 progress_t progress_{progress};
@@ -310,6 +331,9 @@ static void search_typed( //
310331 atomic_error_t atomic_error{nullptr };
311332 {
312333 py::gil_scoped_release release;
334+ std::unique_lock<std::mutex> lock (*index.mutex_ptr_ );
335+ if (!index.try_reserve (index_limits_t (index.size (), threads)))
336+ throw std::invalid_argument (" Out of memory!" );
313337 executor_default_t {threads}.dynamic (vectors_count, [&](std::size_t thread_idx, std::size_t task_idx) {
314338 scalar_at const * vector = (scalar_at const *)(vectors_data + task_idx * vectors_info.strides [0 ]);
315339 dense_search_result_t result = index.search (vector, wanted, thread_idx, exact);
@@ -379,6 +403,7 @@ static void search_typed( //
379403 atomic_error_t atomic_error{nullptr };
380404 {
381405 py::gil_scoped_release release;
406+ std::unique_lock<std::mutex> lock (*indexes.mutex_ptr_ );
382407 executor_default_t {threads}.dynamic (indexes.shards_ .size (), [&](std::size_t thread_idx, std::size_t task_idx) {
383408 dense_index_py_t & index = *indexes.shards_ [task_idx].get ();
384409
@@ -760,6 +785,7 @@ static py::tuple cluster_vectors( //
760785 : numpy_string_to_kind (queries_info.format );
761786 {
762787 py::gil_scoped_release release;
788+ std::unique_lock<std::mutex> lock (*index.mutex_ptr_ );
763789 switch (kind) {
764790 case scalar_kind_t ::f64_k: cluster_result = index.cluster (queries_begin.as <f64_t const >(), queries_end.as <f64_t const >(), config, keys_ptr, distances_ptr, executor, progress_t {progress}); break ;
765791 case scalar_kind_t ::f32_k: cluster_result = index.cluster (queries_begin.as <f32_t const >(), queries_end.as <f32_t const >(), config, keys_ptr, distances_ptr, executor, progress_t {progress}); break ;
@@ -834,6 +860,7 @@ static py::tuple cluster_keys( //
834860 dense_clustering_result_t cluster_result;
835861 {
836862 py::gil_scoped_release release;
863+ std::unique_lock<std::mutex> lock (*index.mutex_ptr_ );
837864 cluster_result =
838865 index.cluster (queries_begin, queries_end, config, keys_ptr, distances_ptr, executor, progress_t {progress});
839866 }
@@ -871,7 +898,11 @@ static std::unordered_map<dense_key_t, dense_key_t> join_index( //
871898 executor_default_t executor{threads};
872899 join_result_t result;
873900 {
901+ // Lock only the receiver `a`; `b` is read-only from this side and its
902+ // mutex is not taken. Running `join(a, b)` and `join(b, a)` concurrently
903+ // from two Python threads is therefore unsupported.
874904 py::gil_scoped_release release;
905+ std::unique_lock<std::mutex> lock (*a.mutex_ptr_ );
875906 result = a.join (b, config, a_to_b, b_to_a, executor, progress_t {progress});
876907 }
877908 forward_error (result);
@@ -893,10 +924,11 @@ static void compact_index(dense_index_py_t& index, std::size_t threads, progress
893924
894925 if (!threads)
895926 threads = std::thread::hardware_concurrency ();
896- if (!index.try_reserve (index_limits_t (index.size (), threads)))
897- throw std::invalid_argument (" Out of memory!" );
898927
899928 py::gil_scoped_release release;
929+ std::unique_lock<std::mutex> lock (*index.mutex_ptr_ );
930+ if (!index.try_reserve (index_limits_t (index.size (), threads)))
931+ throw std::invalid_argument (" Out of memory!" );
900932 index.compact (executor_default_t {threads}, progress_t {progress});
901933}
902934
@@ -1316,10 +1348,11 @@ PYBIND11_MODULE(compiled, m, py::mod_gil_not_used()) {
13161348
13171349 if (!threads)
13181350 threads = std::thread::hardware_concurrency ();
1319- if (!index.try_reserve (index_limits_t (index.size (), threads)))
1320- throw std::invalid_argument (" Out of memory!" );
13211351
13221352 py::gil_scoped_release release;
1353+ std::unique_lock<std::mutex> lock (*index.mutex_ptr_ );
1354+ if (!index.try_reserve (index_limits_t (index.size (), threads)))
1355+ throw std::invalid_argument (" Out of memory!" );
13231356 index.isolate (executor_default_t {threads});
13241357 return result.completed ;
13251358 },
@@ -1336,10 +1369,11 @@ PYBIND11_MODULE(compiled, m, py::mod_gil_not_used()) {
13361369
13371370 if (!threads)
13381371 threads = std::thread::hardware_concurrency ();
1339- if (!index.try_reserve (index_limits_t (index.size (), threads)))
1340- throw std::invalid_argument (" Out of memory!" );
13411372
13421373 py::gil_scoped_release release;
1374+ std::unique_lock<std::mutex> lock (*index.mutex_ptr_ );
1375+ if (!index.try_reserve (index_limits_t (index.size (), threads)))
1376+ throw std::invalid_argument (" Out of memory!" );
13431377 index.isolate (executor_default_t {threads});
13441378 return result.completed ;
13451379 },
0 commit comments