From 981191f789cba2a74fe2ba7b5e1da96f658b069c Mon Sep 17 00:00:00 2001
From: Kaur Matas <33095685+kmatasfp@users.noreply.github.com>
Date: Tue, 2 Jun 2026 13:38:36 -0700
Subject: [PATCH 01/60] feat: cloud-mode TestMode::Cloud for benchmarks with
 best-effort cleanup (#3596)

---
 golem-test-framework/src/benchmark/config.rs  |  82 ++-
 golem-test-framework/src/benchmark/mod.rs     |   1 +
 golem-test-framework/src/benchmark/results.rs |   9 +
 .../component_compilation_service/mod.rs      |   1 +
 .../unavailable.rs                            |  35 ++
 .../src/components/rdb/mod.rs                 |   1 +
 .../src/components/rdb/unavailable.rs         |  31 +
 .../src/components/redis/mod.rs               |   1 +
 .../src/components/redis/unavailable.rs       |  43 ++
 .../src/components/redis_monitor/mod.rs       |   1 +
 .../components/redis_monitor/unavailable.rs   |  29 +
 .../src/components/registry_service/cloud.rs  | 167 ++++++
 .../src/components/registry_service/mod.rs    |   1 +
 .../src/components/shard_manager/mod.rs       |   1 +
 .../components/shard_manager/unavailable.rs   |  56 ++
 .../components/worker_executor_cluster/mod.rs |   1 +
 .../worker_executor_cluster/unavailable.rs    |  63 +++
 .../src/components/worker_service/cloud.rs    | 113 ++++
 .../src/components/worker_service/mod.rs      |   1 +
 golem-test-framework/src/config/benchmark.rs  | 166 +++++-
 golem-test-framework/src/config/dsl_impl.rs   |   5 +-
 golem-test-framework/src/config/mod.rs        |  14 +-
 integration-tests/src/benchmarks/all.rs       | 220 +++++++-
 integration-tests/src/benchmarks/cleanup.rs   | 529 ++++++++++++++++++
 .../src/benchmarks/cold_start_unknown.rs      |  25 +-
 .../src/benchmarks/durability_overhead.rs     |   6 +-
 integration-tests/src/benchmarks/latency.rs   |   8 +-
 integration-tests/src/benchmarks/mod.rs       |  10 +-
 integration-tests/src/benchmarks/sleep.rs     |   8 +-
 .../src/benchmarks/throughput.rs              | 204 ++++---
 30 files changed, 1741 insertions(+), 91 deletions(-)
 create mode 100644 golem-test-framework/src/components/component_compilation_service/unavailable.rs
 create mode 100644 golem-test-framework/src/components/rdb/unavailable.rs
 create mode 100644 golem-test-framework/src/components/redis/unavailable.rs
 create mode 100644 golem-test-framework/src/components/redis_monitor/unavailable.rs
 create mode 100644 golem-test-framework/src/components/registry_service/cloud.rs
 create mode 100644 golem-test-framework/src/components/shard_manager/unavailable.rs
 create mode 100644 golem-test-framework/src/components/worker_executor_cluster/unavailable.rs
 create mode 100644 golem-test-framework/src/components/worker_service/cloud.rs
 create mode 100644 integration-tests/src/benchmarks/cleanup.rs
diff --git a/golem-test-framework/src/benchmark/config.rs b/golem-test-framework/src/benchmark/config.rs
index c011ac65b0..0d172baa24 100644
--- a/golem-test-framework/src/benchmark/config.rs
+++ b/golem-test-framework/src/benchmark/config.rs
@@ -116,7 +116,7 @@ pub struct BenchmarkSuiteItem {
 impl BenchmarkSuiteItem {
     pub fn runs(&self, mode: &TestMode) -> Vec<RunConfig> {
         let cluster_size: Vec<usize> = match mode {
-            TestMode::Provided { .. } => {
+            TestMode::Provided { .. } | TestMode::Cloud { .. } => {
                 vec![0]
             }
             _ => self
@@ -163,3 +163,83 @@ impl BenchmarkSuiteItem {
         res
     }
 }
+
+/// Smoke tests for cloud-mode wiring that do not require running services.
+///
+/// For a full end-to-end smoke test that exercises actual HTTP clients,
+/// cleanup, and the benchmark API contract, run the binary directly against a
+/// local Spawned cluster:
+///
+/// ```text
+/// cargo run --bin benchmarks -- benchmark cold-start-unknown-small \
+///   --size 1 \
+///   --iterations 1 \
+///   --length 0 \
+///   cloud \
+///   --api-url http://localhost:8081 \
+///   --apps-base-domain golem.cloud \
+///   --admin-account-token <token> \
+///   --builtin-plugin-owner-account-id <uuid> \
+///   --default-plan-id <uuid>
+/// ```
+#[cfg(test)]
+mod cloud_mode_smoke {
+    use super::*;
+    use test_r::test;
+    use url::Url;
+    use uuid::Uuid;
+
+    fn cloud_mode() -> TestMode {
+        TestMode::Cloud {
+            api_url: Url::parse("https://release.dev-api.golem.cloud").unwrap(),
+            apps_base_domain: "apps.dev.golem.cloud".to_string(),
+            admin_account_token: "test-token".to_string(),
+            builtin_plugin_owner_account_id: Uuid::nil(),
+            default_plan_id: Uuid::nil(),
+            shard_manager_grpc_host: None,
+            shard_manager_grpc_port: None,
+            component_directory: "test-components".to_string(),
+        }
+    }
+
+    /// With one `size` and one `length`, cloud mode yields exactly one `RunConfig`
+    /// with `cluster_size=0`, however many `cluster_size` values the item lists.
+    #[test]
+    fn runs_returns_single_cluster_size_zero_run() {
+        let mode = cloud_mode();
+        let item = BenchmarkSuiteItem {
+            name: "cold-start-unknown-small".to_string(),
+            iterations: 3,
+            cluster_size: vec![1, 3, 5], // must be ignored in cloud mode
+            size: vec![10],
+            length: vec![100],
+            disable_compilation_cache: None,
+        };
+        let runs = item.runs(&mode);
+        assert_eq!(runs.len(), 1, "cloud mode ignores cluster_size variations");
+        assert_eq!(runs[0].cluster_size, 0, "cloud mode cluster_size must be 0");
+        assert_eq!(runs[0].size, 10);
+        assert_eq!(runs[0].length, 100);
+    }
+
+    /// Multiple size and length combinations still expand normally; only
+    /// `cluster_size` is collapsed.
+    #[test]
+    fn runs_expands_size_and_length_but_not_cluster_size() {
+        let mode = cloud_mode();
+        let item = BenchmarkSuiteItem {
+            name: "latency-small".to_string(),
+            iterations: 1,
+            cluster_size: vec![1, 3],
+            size: vec![5, 10],
+            length: vec![50, 100],
+            disable_compilation_cache: None,
+        };
+        let runs = item.runs(&mode);
+        // 1 (collapsed cluster_size) × 2 sizes × 2 lengths = 4 runs
+        assert_eq!(runs.len(), 4);
+        for r in &runs {
+            assert_eq!(r.cluster_size, 0);
+        }
+    }
+}
diff --git a/golem-test-framework/src/benchmark/mod.rs b/golem-test-framework/src/benchmark/mod.rs
index 1f349afddd..fd246b2be7 100644
--- a/golem-test-framework/src/benchmark/mod.rs
+++ b/golem-test-framework/src/benchmark/mod.rs
@@ -301,6 +301,7 @@ impl<B: Benchmark> BenchmarkApi for B {
             description: B::description().to_string(),
             runs,
             results,
+            run_id: None,
         }
     }
 }
diff --git a/golem-test-framework/src/benchmark/results.rs b/golem-test-framework/src/benchmark/results.rs
index 1cb0f329b6..a309d1d5e5 100644
--- a/golem-test-framework/src/benchmark/results.rs
+++ b/golem-test-framework/src/benchmark/results.rs
@@ -495,6 +495,10 @@ pub struct BenchmarkSuiteResult {
     pub environment: String,
     pub version: String,
     pub timestamp: DateTime<Utc>,
+    /// Suite-level run-id. Set in cloud mode to `bench-{run_id}` to allow
+    /// cross-run correlation and garbage collection of orphaned state.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub run_id: Option<String>,
     pub results: Vec<BenchmarkResult>,
 }
 
@@ -530,6 +534,7 @@ impl BenchmarkSuiteResult {
             environment,
             version: golem_common::golem_version().to_string(),
             timestamp: Utc::now(),
+            run_id: None,
             results: vec![],
         }
     }
@@ -606,6 +611,10 @@ pub struct BenchmarkResult {
     pub description: String,
     pub runs: Vec<RunConfig>,
     pub results: Vec<BenchmarkRunResult>,
+    /// Suite-level run-id. Set in cloud mode to `bench-{run_id}` to allow
+    /// cross-run correlation and garbage collection of orphaned state.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub run_id: Option<String>,
 }
 
 impl BenchmarkResult {
diff --git a/golem-test-framework/src/components/component_compilation_service/mod.rs b/golem-test-framework/src/components/component_compilation_service/mod.rs
index f80d2f84d6..50da099698 100644
--- a/golem-test-framework/src/components/component_compilation_service/mod.rs
+++ b/golem-test-framework/src/components/component_compilation_service/mod.rs
@@ -21,6 +21,7 @@ use tracing::Level;
 
 pub mod provided;
 pub mod spawned;
+pub mod unavailable;
 
 #[async_trait]
 pub trait ComponentCompilationService: Send + Sync {
diff --git a/golem-test-framework/src/components/component_compilation_service/unavailable.rs b/golem-test-framework/src/components/component_compilation_service/unavailable.rs
new file mode 100644
index 0000000000..fb355cd0b3
--- /dev/null
+++ b/golem-test-framework/src/components/component_compilation_service/unavailable.rs
@@ -0,0 +1,35 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::ComponentCompilationService;
+use async_trait::async_trait;
+
+/// A `ComponentCompilationService` that is not directly reachable. Used in
+/// cloud mode, where it is an internal cluster component with no external
+/// exposure. `kill` is a no-op so that `kill_all()` completes; operational
+/// methods panic with a clear message.
+pub struct UnavailableComponentCompilationService;
+
+#[async_trait]
+impl ComponentCompilationService for UnavailableComponentCompilationService {
+    fn grpc_host(&self) -> String {
+        panic!("component_compilation_service() is not available in cloud mode");
+    }
+
+    fn grpc_port(&self) -> u16 {
+        panic!("component_compilation_service() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/rdb/mod.rs b/golem-test-framework/src/components/rdb/mod.rs
index 5f1b5c7fb8..ce8863c10e 100644
--- a/golem-test-framework/src/components/rdb/mod.rs
+++ b/golem-test-framework/src/components/rdb/mod.rs
@@ -29,6 +29,7 @@ pub mod docker_mysql;
 pub mod docker_postgres;
 pub mod provided_postgres;
 pub mod sqlite;
+pub mod unavailable;
 
 #[async_trait]
 pub trait Rdb: Send + Sync {
diff --git a/golem-test-framework/src/components/rdb/unavailable.rs b/golem-test-framework/src/components/rdb/unavailable.rs
new file mode 100644
index 0000000000..1df99efe70
--- /dev/null
+++ b/golem-test-framework/src/components/rdb/unavailable.rs
@@ -0,0 +1,31 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::{DbInfo, Rdb};
+use async_trait::async_trait;
+
+/// An `Rdb` that is not directly reachable. Used in cloud mode, where the
+/// database is an internal cluster component with no external exposure.
+/// Lifecycle teardown (`kill`) is a no-op so that `kill_all()` completes;
+/// operational methods panic with a clear message.
+pub struct UnavailableRdb;
+
+#[async_trait]
+impl Rdb for UnavailableRdb {
+    fn info(&self) -> DbInfo {
+        panic!("rdb() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/redis/mod.rs b/golem-test-framework/src/components/redis/mod.rs
index df14595c7a..62346ec293 100644
--- a/golem-test-framework/src/components/redis/mod.rs
+++ b/golem-test-framework/src/components/redis/mod.rs
@@ -20,6 +20,7 @@ use tracing::info;
 pub mod provided;
 pub mod spawned;
 pub mod spawned_tls;
+pub mod unavailable;
 
 #[async_trait]
 pub trait Redis: Send + Sync {
diff --git a/golem-test-framework/src/components/redis/unavailable.rs b/golem-test-framework/src/components/redis/unavailable.rs
new file mode 100644
index 0000000000..0f24489fe9
--- /dev/null
+++ b/golem-test-framework/src/components/redis/unavailable.rs
@@ -0,0 +1,43 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::Redis;
+use async_trait::async_trait;
+
+/// A `Redis` that is not directly reachable. Used in cloud mode, where Redis
+/// is an internal cluster component with no external exposure. `kill` is a
+/// no-op so that `kill_all()` completes; operational methods panic with a
+/// clear message.
+pub struct UnavailableRedis;
+
+#[async_trait]
+impl Redis for UnavailableRedis {
+    fn assert_valid(&self) {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    fn private_host(&self) -> String {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    fn private_port(&self) -> u16 {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    fn prefix(&self) -> &str {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/redis_monitor/mod.rs b/golem-test-framework/src/components/redis_monitor/mod.rs
index eb73fe0e0d..2a24665ec5 100644
--- a/golem-test-framework/src/components/redis_monitor/mod.rs
+++ b/golem-test-framework/src/components/redis_monitor/mod.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod spawned;
+pub mod unavailable;
 
 pub trait RedisMonitor: Send + Sync {
     fn assert_valid(&self);
diff --git a/golem-test-framework/src/components/redis_monitor/unavailable.rs b/golem-test-framework/src/components/redis_monitor/unavailable.rs
new file mode 100644
index 0000000000..bdde53d231
--- /dev/null
+++ b/golem-test-framework/src/components/redis_monitor/unavailable.rs
@@ -0,0 +1,29 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::RedisMonitor;
+
+/// A `RedisMonitor` that is not directly reachable. Used in cloud mode, where
+/// Redis is an internal cluster component with no external exposure. `kill` is
+/// a no-op so that `kill_all()` completes; operational methods panic with a
+/// clear message.
+pub struct UnavailableRedisMonitor;
+
+impl RedisMonitor for UnavailableRedisMonitor {
+    fn assert_valid(&self) {
+        panic!("redis_monitor() is not available in cloud mode");
+    }
+
+    fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/registry_service/cloud.rs b/golem-test-framework/src/components/registry_service/cloud.rs
new file mode 100644
index 0000000000..79e5d03935
--- /dev/null
+++ b/golem-test-framework/src/components/registry_service/cloud.rs
@@ -0,0 +1,167 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::RegistryService;
+use async_trait::async_trait;
+use golem_client::api::RegistryServiceClientLive;
+use golem_client::{Context, Security};
+use golem_common::model::account::{AccountEmail, AccountId};
+use golem_common::model::auth::TokenSecret;
+use golem_common::model::plan::PlanId;
+use std::time::Duration;
+use tokio::sync::OnceCell;
+use tracing::info;
+use url::Url;
+
+/// Registry-service client for cloud mode.
+///
+/// In the deployed Golem environment both registry-service and worker-service
+/// are reachable behind a single Gateway API hostname
+/// (e.g. `https://release.dev-api.golem.cloud`). This struct holds that shared
+/// `api_url`; routing to the correct backend service is done by the Gateway
+/// based on URL path.
+pub struct CloudRegistryService {
+    api_url: Url,
+    admin_token: TokenSecret,
+    builtin_plugin_owner_account_id: AccountId,
+    default_plan_id: PlanId,
+    base_http_client: OnceCell<reqwest_middleware::ClientWithMiddleware>,
+}
+
+impl CloudRegistryService {
+    pub fn new(
+        api_url: Url,
+        admin_token: TokenSecret,
+        builtin_plugin_owner_account_id: AccountId,
+        default_plan_id: PlanId,
+    ) -> Self {
+        info!("Using cloud API gateway at {api_url}");
+        Self {
+            api_url,
+            admin_token,
+            builtin_plugin_owner_account_id,
+            default_plan_id,
+            base_http_client: OnceCell::new(),
+        }
+    }
+}
+
+/// Constructs the tuned HTTP client for cloud-mode benchmark connections.
+///
+/// Settings: up to 1024 idle pooled connections per host, 90-second pool idle
+/// timeout, TCP nodelay, and a 180-second request timeout.
+///
+/// Note: `http2_prior_knowledge()` is deliberately **not** set. Prior
+/// knowledge is for h2c (HTTP/2 over plain HTTP). All cloud endpoints are
+/// HTTPS, where HTTP/2 is negotiated through ALPN during the TLS handshake
+/// (TLS termination happens at Envoy). Setting prior knowledge would bypass
+/// ALPN and can cause protocol errors.
+pub fn new_cloud_reqwest_client() -> reqwest_middleware::ClientWithMiddleware {
+    let client = reqwest::ClientBuilder::new()
+        .pool_max_idle_per_host(1024)
+        .pool_idle_timeout(Duration::from_secs(90))
+        .tcp_nodelay(true)
+        .timeout(Duration::from_secs(180))
+        .build()
+        .expect("Failed to build cloud HTTP client");
+    reqwest_middleware::ClientBuilder::new(client)
+        .with(reqwest_tracing::TracingMiddleware::default())
+        .build()
+}
+
+#[async_trait]
+impl RegistryService for CloudRegistryService {
+    fn http_host(&self) -> String {
+        self.api_url.host_str().unwrap_or("localhost").to_string()
+    }
+
+    fn http_port(&self) -> u16 {
+        self.api_url.port_or_known_default().unwrap_or(443)
+    }
+
+    fn grpc_host(&self) -> String {
+        panic!("grpc_host() is not available through the Gateway in cloud mode");
+    }
+
+    fn grpc_port(&self) -> u16 {
+        panic!("grpc_port() is not available through the Gateway in cloud mode");
+    }
+
+    fn admin_account_id(&self) -> AccountId {
+        AccountId(uuid::Uuid::nil())
+    }
+
+    fn admin_account_email(&self) -> AccountEmail {
+        AccountEmail::new(String::new())
+    }
+
+    fn admin_account_token(&self) -> TokenSecret {
+        self.admin_token.clone()
+    }
+
+    fn builtin_plugin_owner_account_id(&self) -> AccountId {
+        self.builtin_plugin_owner_account_id
+    }
+
+    fn default_plan(&self) -> PlanId {
+        self.default_plan_id
+    }
+
+    fn low_fuel_plan(&self) -> PlanId {
+        panic!(
+            "low_fuel_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    fn low_disk_space_plan(&self) -> PlanId {
+        panic!(
+            "low_disk_space_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    fn low_http_calls_plan(&self) -> PlanId {
+        panic!(
+            "low_http_calls_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    fn low_rpc_calls_plan(&self) -> PlanId {
+        panic!(
+            "low_rpc_calls_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    async fn kill(&self) {}
+
+    async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware {
+        self.base_http_client
+            .get_or_init(|| async { new_cloud_reqwest_client() })
+            .await
+            .clone()
+    }
+
+    async fn client(&self, token: &TokenSecret) -> RegistryServiceClientLive {
+        RegistryServiceClientLive {
+            context: Context {
+                client: self.base_http_client().await,
+                base_url: self.api_url.clone(),
+                security_token: Security::Bearer(token.secret().to_string()),
+            },
+        }
+    }
+}
diff --git a/golem-test-framework/src/components/registry_service/mod.rs b/golem-test-framework/src/components/registry_service/mod.rs
index d38f88577d..42b0b9ddfd 100644
--- a/golem-test-framework/src/components/registry_service/mod.rs
+++ b/golem-test-framework/src/components/registry_service/mod.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod cloud;
 pub mod provided;
 pub mod spawned;
 
diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs
index 5245865e4b..67b10051bf 100644
--- a/golem-test-framework/src/components/shard_manager/mod.rs
+++ b/golem-test-framework/src/components/shard_manager/mod.rs
@@ -14,6 +14,7 @@
 
 pub mod provided;
 pub mod spawned;
+pub mod unavailable;
 
 use super::rdb::Rdb;
 use super::registry_service::RegistryService;
diff --git a/golem-test-framework/src/components/shard_manager/unavailable.rs b/golem-test-framework/src/components/shard_manager/unavailable.rs
new file mode 100644
index 0000000000..834dfb8d2c
--- /dev/null
+++ b/golem-test-framework/src/components/shard_manager/unavailable.rs
@@ -0,0 +1,56 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::ShardManager;
+use async_trait::async_trait;
+use golem_common::model::RoutingTable;
+
+/// A `ShardManager` that is not directly reachable. Used in cloud mode when no
+/// shard-manager port-forward is configured; pass `--shard-manager-grpc-host`
+/// and `--shard-manager-grpc-port` to use a real `ProvidedShardManager`
+/// instead.
+///
+/// `kill`/`restart` are no-ops. `get_routing_table()` returns an error so that
+/// callers (e.g. the throughput benchmark) can fall back to the unlabeled
+/// single-bucket mode. The host/port accessors panic with a clear message.
+pub struct UnavailableShardManager;
+
+#[async_trait]
+impl ShardManager for UnavailableShardManager {
+    fn grpc_host(&self) -> String {
+        panic!(
+            "shard_manager() requires --shard-manager-grpc-host and \
+             --shard-manager-grpc-port to be configured in cloud mode"
+        );
+    }
+
+    fn grpc_port(&self) -> u16 {
+        panic!(
+            "shard_manager() requires --shard-manager-grpc-host and \
+             --shard-manager-grpc-port to be configured in cloud mode"
+        );
+    }
+
+    async fn kill(&self) {}
+
+    async fn restart(&self, _number_of_shards_override: Option<usize>) {}
+
+    async fn get_routing_table(&self) -> crate::Result<RoutingTable> {
+        Err(anyhow::anyhow!(
+            "shard_manager is not configured in cloud mode; \
+             pass --shard-manager-grpc-host and --shard-manager-grpc-port \
+             to enable routing table fetch and local/remote RPC labeling"
+        ))
+    }
+}
diff --git a/golem-test-framework/src/components/worker_executor_cluster/mod.rs b/golem-test-framework/src/components/worker_executor_cluster/mod.rs
index 2dc8e21745..e1db10b237 100644
--- a/golem-test-framework/src/components/worker_executor_cluster/mod.rs
+++ b/golem-test-framework/src/components/worker_executor_cluster/mod.rs
@@ -18,6 +18,7 @@ use std::sync::Arc;
 
 pub mod provided;
 pub mod spawned;
+pub mod unavailable;
 
 #[async_trait]
 pub trait WorkerExecutorCluster: Send + Sync {
diff --git a/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs
new file mode 100644
index 0000000000..53a5cc87be
--- /dev/null
+++ b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs
@@ -0,0 +1,63 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::components::worker_executor::WorkerExecutor;
+use crate::components::worker_executor_cluster::WorkerExecutorCluster;
+use async_trait::async_trait;
+use std::sync::Arc;
+
+/// A `WorkerExecutorCluster` whose individual executors are not directly
+/// reachable. Used in cloud mode, where executors are internal cluster
+/// components with no external exposure.
+///
+/// `kill_all` and `restart_all` are no-ops so that teardown completes without
+/// panicking. `is_running()` returns `true` so that
+/// `ensure_all_deps_running()` is a no-op. Every other method (`size`, `stop`,
+/// `start`, `to_vec`, `stopped_indices`, `started_indices`) panics.
+pub struct UnavailableWorkerExecutorCluster;
+
+#[async_trait]
+impl WorkerExecutorCluster for UnavailableWorkerExecutorCluster {
+    fn size(&self) -> usize {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn kill_all(&self) {}
+
+    async fn restart_all(&self) {}
+
+    async fn stop(&self, _index: usize) {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn start(&self, _index: usize) {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    fn to_vec(&self) -> Vec<Arc<dyn WorkerExecutor>> {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn stopped_indices(&self) -> Vec<usize> {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn started_indices(&self) -> Vec<usize> {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn is_running(&self) -> bool {
+        true
+    }
+}
diff --git a/golem-test-framework/src/components/worker_service/cloud.rs b/golem-test-framework/src/components/worker_service/cloud.rs
new file mode 100644
index 0000000000..ceb60f4fbe
--- /dev/null
+++ b/golem-test-framework/src/components/worker_service/cloud.rs
@@ -0,0 +1,113 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::components::registry_service::cloud::new_cloud_reqwest_client;
+use crate::components::worker_service::WorkerService;
+use async_trait::async_trait;
+use golem_client::api::{AgentClientLive, WorkerClientLive};
+use golem_client::{Context, Security};
+use golem_common::model::auth::TokenSecret;
+use tokio::sync::OnceCell;
+use tracing::info;
+use url::Url;
+
+/// Worker-service client for cloud mode.
+///
+/// In the deployed Golem environment both registry-service and worker-service
+/// are reachable behind a single Gateway API hostname
+/// (e.g. `https://release.dev-api.golem.cloud`). This struct holds that shared
+/// `api_url`; routing to worker-service is done by the Gateway based on URL
+/// path (`/v1/components/*/workers/**`, `/v1/agents/**`).
+pub struct CloudWorkerService {
+    api_url: Url,
+    base_http_client: OnceCell<reqwest_middleware::ClientWithMiddleware>,
+}
+
+impl CloudWorkerService {
+    pub fn new(api_url: Url) -> Self {
+        info!("Using cloud worker-service via API gateway at {api_url}");
+        Self {
+            api_url,
+            base_http_client: OnceCell::new(),
+        }
+    }
+}
+
+#[async_trait]
+impl WorkerService for CloudWorkerService {
+    fn http_host(&self) -> String {
+        self.api_url.host_str().unwrap_or("localhost").to_string()
+    }
+
+    fn http_port(&self) -> u16 {
+        self.api_url.port_or_known_default().unwrap_or(443)
+    }
+
+    fn grpc_host(&self) -> String {
+        panic!("grpc_host() is not available through the Gateway in cloud mode");
+    }
+
+    fn gprc_port(&self) -> u16 {
+        panic!("gprc_port() is not available through the Gateway in cloud mode");
+    }
+
+    fn custom_request_host(&self) -> String {
+        // Code-first HTTP API deployments are reached via the apps base domain
+        // (*.apps.dev.golem.cloud), not through this host.
+        panic!("custom_request_host() is not available in cloud mode");
+    }
+
+    fn custom_request_port(&self) -> u16 {
+        // Code-first HTTP API deployments are reached via the apps base domain
+        // (*.apps.dev.golem.cloud), not through this port.
+        panic!("custom_request_port() is not available in cloud mode");
+    }
+
+    fn mcp_port(&self) -> u16 {
+        panic!("mcp_port() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+
+    async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware {
+        self.base_http_client
+            .get_or_init(|| async { new_cloud_reqwest_client() })
+            .await
+            .clone()
+    }
+
+    /// Overrides the trait default to use the configured API gateway URL
+    /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`.
+    async fn worker_http_client(&self, token: &TokenSecret) -> WorkerClientLive {
+        WorkerClientLive {
+            context: Context {
+                client: self.base_http_client().await,
+                base_url: self.api_url.clone(),
+                security_token: Security::Bearer(token.secret().to_string()),
+            },
+        }
+    }
+
+    /// Overrides the trait default to use the configured API gateway URL
+    /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`.
+    async fn agent_http_client(&self, token: &TokenSecret) -> AgentClientLive {
+        AgentClientLive {
+            context: Context {
+                client: self.base_http_client().await,
+                base_url: self.api_url.clone(),
+                security_token: Security::Bearer(token.secret().to_string()),
+            },
+        }
+    }
+}
diff --git a/golem-test-framework/src/components/worker_service/mod.rs b/golem-test-framework/src/components/worker_service/mod.rs
index 6885e86696..126cc988c9 100644
--- a/golem-test-framework/src/components/worker_service/mod.rs
+++ b/golem-test-framework/src/components/worker_service/mod.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod cloud;
 pub mod provided;
 pub mod spawned;
 
diff --git a/golem-test-framework/src/config/benchmark.rs b/golem-test-framework/src/config/benchmark.rs
index a1a304436b..e97a58c47d 100644
--- a/golem-test-framework/src/config/benchmark.rs
+++ b/golem-test-framework/src/config/benchmark.rs
@@ -16,15 +16,21 @@ use crate::benchmark::BenchmarkConfig;
 use crate::components::component_compilation_service::ComponentCompilationService;
 use crate::components::component_compilation_service::provided::ProvidedComponentCompilationService;
 use crate::components::component_compilation_service::spawned::SpawnedComponentCompilationService;
+use crate::components::component_compilation_service::unavailable::UnavailableComponentCompilationService;
+use crate::components::rdb::PostgresInfo;
+use crate::components::rdb::Rdb;
 use crate::components::rdb::docker_postgres::DockerPostgresRdb;
 use crate::components::rdb::provided_postgres::ProvidedPostgresRdb;
-use crate::components::rdb::{PostgresInfo, Rdb};
+use crate::components::rdb::unavailable::UnavailableRdb;
 use crate::components::redis::Redis;
 use crate::components::redis::provided::ProvidedRedis;
 use crate::components::redis::spawned::SpawnedRedis;
+use crate::components::redis::unavailable::UnavailableRedis;
 use crate::components::redis_monitor::RedisMonitor;
 use crate::components::redis_monitor::spawned::SpawnedRedisMonitor;
+use crate::components::redis_monitor::unavailable::UnavailableRedisMonitor;
 use crate::components::registry_service::RegistryService;
+use crate::components::registry_service::cloud::CloudRegistryService;
 use crate::components::registry_service::provided::ProvidedRegistryService;
 use crate::components::registry_service::spawned::SpawnedRegistryService;
 use crate::components::service::Service;
@@ -32,10 +38,13 @@ use crate::components::service::spawned::SpawnedService;
 use crate::components::shard_manager::ShardManager;
 use crate::components::shard_manager::provided::ProvidedShardManager;
 use crate::components::shard_manager::spawned::SpawnedShardManager;
+use crate::components::shard_manager::unavailable::UnavailableShardManager;
 use crate::components::worker_executor_cluster::WorkerExecutorCluster;
 use crate::components::worker_executor_cluster::provided::ProvidedWorkerExecutorCluster;
 use crate::components::worker_executor_cluster::spawned::SpawnedWorkerExecutorCluster;
+use crate::components::worker_executor_cluster::unavailable::UnavailableWorkerExecutorCluster;
 use crate::components::worker_service::WorkerService;
+use crate::components::worker_service::cloud::CloudWorkerService;
 use crate::components::worker_service::provided::ProvidedWorkerService;
 use crate::components::worker_service::spawned::SpawnedWorkerService;
 use crate::config::TestDependencies;
@@ -51,11 +60,24 @@ use golem_service_base::storage::blob::BlobStorage;
 use golem_service_base::storage::blob::fs::FileSystemBlobStorage;
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
 use tempfile::TempDir;
 use tracing::Level;
+use url::Url;
 use uuid::Uuid;
 
+/// Process-wide run-id for cloud-mode benchmarks. It is generated lazily the
+/// first time a cloud-mode `BenchmarkTestDependencies` is built and shared by
+/// every later cloud context in the same binary invocation. It prefixes
+/// account/app/env names (`bench-{run_id}-…`) and is written to result JSON.
+static CLOUD_BENCH_RUN_ID: OnceLock<Uuid> = OnceLock::new();
+
+/// Returns the suite-level run-id if any cloud benchmark context has been
+/// created in this process, `None` otherwise. Never initialises the id itself.
+pub fn cloud_bench_run_id() -> Option<Uuid> {
+    CLOUD_BENCH_RUN_ID.get().copied()
+}
+
 /// Test dependencies created from command line arguments
 ///
 /// To be used when a single executable with an async entry point requires
@@ -75,6 +97,12 @@ pub struct BenchmarkTestDependencies {
     component_directory: PathBuf,
     component_temp_directory: Arc<TempDir>,
     registry_service: Arc<dyn RegistryService>,
+    /// Set to `Some` in cloud mode. Used to prefix account/app/env names with
+    /// `bench-{run_id}-` so that orphaned state is traceable.
+    run_id: Option<Uuid>,
+    /// The apps base domain for cloud mode (e.g. `apps.golem.cloud`). Used to
+    /// construct HTTP API deployment domains as `{env_id}.{apps_base_domain}`.
+    apps_base_domain: Option<String>,
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -222,6 +250,54 @@ pub enum TestMode {
         #[arg(long, default_value = "test-components")]
         component_directory: String,
     },
+    /// Cloud mode: run benchmarks against a deployed Golem environment via
+    /// Gateway-API hostnames. No local service processes are spawned.
+    ///
+    /// All management API calls (registry-service, worker-service, agents) go
+    /// through a single Gateway hostname (`--api-url`). HTTP API deployment
+    /// access (code-first HTTP APIs) goes through `{env_id}.{apps_base_domain}`.
+    ///
+    /// For `golem-dev`:
+    ///   `--api-url https://release.dev-api.golem.cloud`
+    ///   `--apps-base-domain apps.dev.golem.cloud`
+    #[command()]
+    Cloud {
+        /// Base URL of the deployed Golem API Gateway. Both registry-service
+        /// and worker-service paths are routed internally by the Gateway.
+        ///
+        /// For the `golem-dev` environment this is
+        /// `https://release.dev-api.golem.cloud`.
+        #[arg(long)]
+        api_url: Url,
+        /// Wildcard base domain used to build per-environment HTTP API
+        /// deployment hostnames: `{env_id}.{apps_base_domain}`.
+        ///
+        /// For the `golem-dev` environment this is `apps.dev.golem.cloud`.
+        #[arg(long)]
+        apps_base_domain: String,
+        /// Bearer token for the admin account. Used to create a fresh user
+        /// account for each benchmark run, which then owns all benchmark state.
+        #[arg(long)]
+        admin_account_token: String,
+        /// UUID of the builtin-plugin-owner account.
+        #[arg(long)]
+        builtin_plugin_owner_account_id: Uuid,
+        /// UUID of the default plan on the target cluster.
+        #[arg(long)]
+        default_plan_id: Uuid,
+        /// Optional shard-manager gRPC hostname for a kubectl port-forward
+        /// (e.g. `localhost`). Must be given together with
+        /// `--shard-manager-grpc-port`; the throughput benchmark then fetches
+        /// the routing table and labels RPC pairs as local/remote.
+        #[arg(long, requires = "shard_manager_grpc_port")]
+        shard_manager_grpc_host: Option<String>,
+        /// Optional shard-manager gRPC port (e.g. `9090`); requires the host.
+        #[arg(long, requires = "shard_manager_grpc_host")]
+        shard_manager_grpc_port: Option<u16>,
+        /// Directory containing test WASM component files.
+        #[arg(long, default_value = "test-components")]
+        component_directory: String,
+    },
 }
 
 impl BenchmarkTestDependencies {
@@ -419,6 +495,8 @@ impl BenchmarkTestDependencies {
             initial_agent_files_service,
             component_temp_directory: Arc::new(TempDir::new().unwrap()),
             registry_service,
+            run_id: None,
+            apps_base_domain: None,
         }
     }
 
@@ -542,6 +620,8 @@ impl BenchmarkTestDependencies {
                     initial_agent_files_service,
                     component_temp_directory: Arc::new(TempDir::new().unwrap()),
                     registry_service,
+                    run_id: None,
+                    apps_base_domain: None,
                 }
             }
             TestMode::Spawned {
@@ -590,17 +670,93 @@ impl BenchmarkTestDependencies {
                 )
                 .await
             }
+            TestMode::Cloud {
+                api_url,
+                apps_base_domain,
+                admin_account_token,
+                builtin_plugin_owner_account_id,
+                default_plan_id,
+                shard_manager_grpc_host,
+                shard_manager_grpc_port,
+                component_directory,
+            } => {
+                let blob_storage = Arc::new(
+                    FileSystemBlobStorage::new(
+                        &std::env::temp_dir().join("golem-bench-blob-storage"),
+                    )
+                    .await
+                    .unwrap(),
+                );
+                let initial_agent_files_service =
+                    Arc::new(InitialAgentFilesService::new(blob_storage.clone()));
+
+                // Use the process-level run_id (shared across all cloud contexts in
+                // this process so all benchmarks in a suite carry the same run ID).
+                let run_id = *CLOUD_BENCH_RUN_ID.get_or_init(Uuid::new_v4);
+                tracing::info!("Cloud benchmark run_id: {run_id}");
+
+                // Both registry-service and worker-service are reachable via the
+                // same Gateway hostname; routing is path-based.
+                let registry_service: Arc<dyn RegistryService> =
+                    Arc::new(CloudRegistryService::new(
+                        api_url.clone(),
+                        TokenSecret::trusted(admin_account_token.clone()),
+                        AccountId(*builtin_plugin_owner_account_id),
+                        PlanId(*default_plan_id),
+                    ));
+
+                let shard_manager: Arc<dyn ShardManager> =
+                    match (shard_manager_grpc_host, shard_manager_grpc_port) {
+                        (Some(host), Some(port)) => {
+                            Arc::new(ProvidedShardManager::new(host.clone(), 0, *port))
+                        }
+                        _ => Arc::new(UnavailableShardManager),
+                    };
+
+                let worker_service: Arc<dyn WorkerService> =
+                    Arc::new(CloudWorkerService::new(api_url.clone()));
+
+                Self {
+                    rdb: Arc::new(UnavailableRdb),
+                    redis: Arc::new(UnavailableRedis),
+                    redis_monitor: Arc::new(UnavailableRedisMonitor),
+                    shard_manager,
+                    component_compilation_service: Arc::new(UnavailableComponentCompilationService),
+                    worker_service,
+                    worker_executor_cluster: Arc::new(UnavailableWorkerExecutorCluster),
+                    component_directory: Path::new(component_directory).to_path_buf(),
+                    blob_storage,
+                    initial_agent_files_service,
+                    component_temp_directory: Arc::new(TempDir::new().unwrap()),
+                    registry_service,
+                    run_id: Some(run_id),
+                    apps_base_domain: Some(apps_base_domain.clone()),
+                }
+            }
         }
     }
 
-    /// Checks if all the spawned dependencies are still running, and if not, panicks
+    /// Checks if all the spawned dependencies are still running, and if not, panics.
     ///
     /// This can be used as a checkpoint in benchmarks to avoid infinite retries.
+    /// In cloud mode this is meant to be a no-op: no executor is spawned, so it
+    /// relies on `UnavailableWorkerExecutorCluster::is_running` returning `true`.
     pub async fn ensure_all_deps_running(&self) {
         if !self.worker_executor_cluster.is_running().await {
             panic!("Worker executor process(es) stopped");
         }
     }
+
+    /// Run-id of this benchmark context: `Some` only in cloud mode, where it is
+    /// used to prefix accounts/apps/envs with `bench-{run_id}-`.
+    pub fn run_id(&self) -> Option<Uuid> {
+        self.run_id
+    }
+
+    /// Apps base domain (e.g. `apps.golem.cloud`); `None` outside cloud mode.
+    pub fn apps_base_domain(&self) -> Option<&str> {
+        self.apps_base_domain.as_deref()
+    }
 }
 
 #[async_trait]
@@ -652,6 +808,10 @@ impl TestDependencies for BenchmarkTestDependencies {
     fn registry_service(&self) -> Arc<dyn RegistryService> {
         self.registry_service.clone()
     }
+
+    fn bench_name_prefix(&self) -> Option<String> {
+        Some(format!("bench-{}-", self.run_id?))
+    }
 }
 
 #[allow(dead_code)]
diff --git a/golem-test-framework/src/config/dsl_impl.rs b/golem-test-framework/src/config/dsl_impl.rs
index b228a5235e..f2d5472175 100644
--- a/golem-test-framework/src/config/dsl_impl.rs
+++ b/golem-test-framework/src/config/dsl_impl.rs
@@ -883,8 +883,9 @@ impl<Deps: TestDependencies> TestDslExtended for TestUserContext<Deps> {
         environment_options: &EnvironmentOptions,
     ) -> anyhow::Result<(Application, Environment)> {
         let client = self.registry_service_client().await;
-        let app_name = ApplicationName(format!("app-{}", Uuid::new_v4()));
-        let env_name = EnvironmentName(format!("env-{}", Uuid::new_v4()));
+        let prefix = self.deps.bench_name_prefix().unwrap_or_default();
+        let app_name = ApplicationName(format!("{prefix}app-{}", Uuid::new_v4()));
+        let env_name = EnvironmentName(format!("{prefix}env-{}", Uuid::new_v4()));
 
         let application = client
             .create_application(
diff --git a/golem-test-framework/src/config/mod.rs b/golem-test-framework/src/config/mod.rs
index f5c14ace60..d8bdbe6b39 100644
--- a/golem-test-framework/src/config/mod.rs
+++ b/golem-test-framework/src/config/mod.rs
@@ -56,6 +56,13 @@ pub trait TestDependencies: Send + Sync + Clone {
     fn initial_agent_files_service(&self) -> Arc<InitialAgentFilesService>;
     fn registry_service(&self) -> Arc<dyn RegistryService>;
 
+    /// Optional name prefix for benchmark-created accounts, applications, and
+    /// environments. Defaults to `None`; cloud-mode dependencies override it to
+    /// return `bench-{run_id}-`, which makes orphaned state traceable.
+    fn bench_name_prefix(&self) -> Option<String> {
+        None
+    }
+
     async fn admin(&self) -> TestUserContext<Self>
     where
         Self: Sized,
@@ -82,7 +89,12 @@ pub trait TestDependencies: Send + Sync + Clone {
             .client(&registry_service.admin_account_token())
             .await;
 
-        let name = Uuid::new_v4().to_string();
+        let uuid = Uuid::new_v4().to_string();
+        let name = if let Some(prefix) = self.bench_name_prefix() {
+            format!("{prefix}{uuid}")
+        } else {
+            uuid
+        };
         let account_data = AccountCreation {
             email: AccountEmail::new(format!("{name}@golem.cloud")),
             name,
diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs
index 91d972534d..6865ecf6e4 100644
--- a/integration-tests/src/benchmarks/all.rs
+++ b/integration-tests/src/benchmarks/all.rs
@@ -13,16 +13,28 @@
 // limitations under the License.
 
 use clap::Parser;
+use golem_client::api::RegistryServiceClient;
+use golem_common::base_model::agent::ParsedAgentId;
+use golem_common::model::AgentId;
+use golem_common::model::application::{ApplicationCreation, ApplicationName};
+use golem_common::model::environment::{EnvironmentCreation, EnvironmentName};
+use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{
     Benchmark, BenchmarkApi, BenchmarkConfig, BenchmarkResult, BenchmarkSuite, BenchmarkSuiteItem,
     BenchmarkSuiteResult,
 };
-use golem_test_framework::config::benchmark::TestMode;
-use golem_test_framework::config::{BenchmarkCliParameters, BenchmarkTestDependencies};
+use golem_test_framework::config::benchmark::{TestMode, cloud_bench_run_id};
+use golem_test_framework::config::{
+    BenchmarkCliParameters, BenchmarkTestDependencies, TestDependencies,
+};
+use golem_test_framework::dsl::{TestDsl, TestDslExtended};
+use integration_tests::benchmarks::{
+    cleanup_account, cleanup_user_state, delete_workers, invoke_and_await_agent,
+};
 use std::collections::BTreeMap;
 use std::future::Future;
 use std::pin::Pin;
-use tracing::{Level, debug, info};
+use tracing::{Level, debug, info, warn};
 
 type RunFn = Box<
     dyn for<'a> Fn(
@@ -144,7 +156,14 @@ async fn main() {
                     length: length.clone(),
                     disable_compilation_cache: Some(*disable_compilation_cache),
                 };
-                let result = f(
+
+                cloud_preflight_warmup(
+                    params.benchmark_config.mode(),
+                    params.service_verbosity(),
+                    params.otlp,
+                )
+                .await;
+                let mut result = f(
                     params.benchmark_config.mode(),
                     params.service_verbosity(),
                     &item,
@@ -152,6 +171,10 @@ async fn main() {
                     params.otlp,
                 )
                 .await;
+                // Attach the run_id to result metadata (cloud mode only).
+                if let Some(run_id) = cloud_bench_run_id() {
+                    result.run_id = Some(format!("bench-{run_id}"));
+                }
                 if params.json {
                     let str = serde_json::to_string(&result)
                         .expect("Failed to serialize BenchmarkResult");
@@ -174,9 +197,27 @@ async fn main() {
             let suite: BenchmarkSuite =
                 serde_yaml::from_str(&raw_suite).expect("Failed to parse benchmark suite");
 
+            // Validate every benchmark name up-front so a typo exits immediately
+            // without running warmup or any prior benchmark.
+            for benchmark in &suite.benchmarks {
+                if !benchmarks_by_name.contains_key(benchmark.name.as_str()) {
+                    print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name);
+                    // print_non_existing_benchmark calls std::process::exit(1)
+                    unreachable!();
+                }
+            }
+
+            // Pre-flight warmup runs after all names are validated.
+            cloud_preflight_warmup(
+                params.benchmark_config.mode(),
+                params.service_verbosity(),
+                params.otlp,
+            )
+            .await;
+
             let mut suite_result = BenchmarkSuiteResult::new(&suite.name);
             for benchmark in suite.benchmarks {
-                info!("Running {benchmark:?}"); // TODO
+                info!("Running {benchmark:?}");
 
                 if let Some(f) = benchmarks_by_name.get(benchmark.name.as_str()) {
                     let result = f(
@@ -188,9 +229,13 @@ async fn main() {
                     )
                     .await;
                     suite_result.add(result);
-                } else {
-                    print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name);
                 }
+                // no else: we already validated all names above
+            }
+
+            // Attach the run_id to result metadata (cloud mode only).
+            if let Some(run_id) = cloud_bench_run_id() {
+                suite_result.run_id = Some(format!("bench-{run_id}"));
             }
 
             if let Some(path) = save_to_json {
@@ -241,3 +286,164 @@ async fn run_benchmark<B: Benchmark>(
 ) -> BenchmarkResult {
     B::run_benchmark(mode, verbosity, item, primary_only, otlp).await
 }
+
+// ── Pre-flight warmup constants ───────────────────────────────────────────────
+
+/// WASM file name (without `.wasm`) of the component used for warmup
+/// invocations.  Must be present in `--component-directory`.
+const WARMUP_COMPONENT_WASM: &str = "benchmark_agent_rust_release";
+/// Registry display name for the warmup component.
+const WARMUP_COMPONENT_NAME: &str = "benchmark:agent-rust";
+/// Agent type whose `echo` method is invoked during warmup.
+const WARMUP_AGENT_TYPE: &str = "RustBenchmarkAgent";
+/// Instance ID of the throwaway warmup agent.
+const WARMUP_AGENT_INSTANCE: &str = "warmup";
+/// Number of throwaway invocations issued during warmup.
+const WARMUP_CALLS: usize = 50;
+/// Total wall-clock budget for all warmup invocations. If it is exceeded (e.g.
+/// a slow first cold start), a warning is logged and the benchmark continues.
+const WARMUP_BUDGET: std::time::Duration = std::time::Duration::from_secs(180);
+
+/// Pre-flight warmup for cloud mode. Runs once at suite/benchmark start;
+/// is a no-op for all non-cloud modes.
+///
+/// Executes `WARMUP_CALLS` throwaway `invoke_and_await_agent` calls against a
+/// short-lived user/env/component. Each call exercises the full stack:
+/// gateway → registry-service (component lookup) → worker-service
+/// → worker-executor, warming NLB target-group routing and HTTP/2 sessions at
+/// every hop so they don't contaminate the first measured iteration.
+///
+/// The whole invocation phase is bounded by `WARMUP_BUDGET` (best-effort).
+///
+/// If a setup step fails (e.g. the warmup component is absent from the
+/// component directory), a warning is logged, the invocation phase is
+/// skipped, and the throwaway state created so far is still cleaned up.
+async fn cloud_preflight_warmup(mode: &TestMode, verbosity: Level, otlp: bool) {
+    if !matches!(mode, TestMode::Cloud { .. }) {
+        return;
+    }
+
+    info!(
+        "Pre-flight warmup: creating throwaway user/env/component ({WARMUP_CALLS} invocations)..."
+    );
+
+    let deps = BenchmarkTestDependencies::new(mode, verbosity, 0, false, otlp).await;
+
+    let user = match deps.user().await {
+        Ok(u) => u,
+        Err(e) => {
+            warn!("Pre-flight warmup: failed to create user (skipping): {e:?}");
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let registry_client = user.registry_service_client().await;
+    let prefix = user.deps.bench_name_prefix().unwrap_or_default();
+
+    let app = match registry_client
+        .create_application(
+            &user.account_id.0,
+            &ApplicationCreation {
+                name: ApplicationName(format!("{prefix}app-warmup")),
+            },
+        )
+        .await
+    {
+        Ok(a) => a,
+        Err(e) => {
+            warn!("Pre-flight warmup: failed to create app (skipping): {e:?}");
+            cleanup_account(&user).await;
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let env = match registry_client
+        .create_environment(
+            &app.id.0,
+            &EnvironmentCreation {
+                name: EnvironmentName(format!("{prefix}env-warmup")),
+                compatibility_check: false,
+                version_check: false,
+                security_overrides: false,
+            },
+        )
+        .await
+    {
+        Ok(e) => e,
+        Err(e) => {
+            warn!("Pre-flight warmup: failed to create env (skipping): {e:?}");
+            // delete app explicitly before account (cascading delete is incomplete)
+            if let Err(del_err) = registry_client
+                .delete_application(&app.id.0, app.revision.into())
+                .await
+            {
+                warn!(
+                    "Pre-flight warmup: failed to delete app {} after env-creation \
+                     failure (best-effort, app may be orphaned): {del_err:?}",
+                    app.id.0
+                );
+            }
+            cleanup_account(&user).await;
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let component = match user
+        .component(&env.id, WARMUP_COMPONENT_WASM)
+        .name(WARMUP_COMPONENT_NAME)
+        .store()
+        .await
+    {
+        Ok(c) => c,
+        Err(e) => {
+            warn!(
+                "Pre-flight warmup: failed to upload warmup component \
+                 ({WARMUP_COMPONENT_WASM}.wasm) — ensure it exists in the \
+                 component directory: {e:?}"
+            );
+            cleanup_user_state(&user, &env.id).await;
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let warmup_agent: ParsedAgentId = agent_id!(WARMUP_AGENT_TYPE, WARMUP_AGENT_INSTANCE);
+
+    // Bound all warmup invocations with a total wall-clock budget.
+    let invoke_result = tokio::time::timeout(WARMUP_BUDGET, async {
+        for i in 0..WARMUP_CALLS {
+            let result = invoke_and_await_agent(
+                &user,
+                &component,
+                &warmup_agent,
+                "echo",
+                data_value!("warmup"),
+            )
+            .await;
+            info!(
+                "Pre-flight warmup invocation {}/{WARMUP_CALLS}: {}ms",
+                i + 1,
+                result.accumulated_time.as_millis()
+            );
+        }
+    })
+    .await;
+
+    if invoke_result.is_err() {
+        warn!(
+            "Pre-flight warmup: invocation phase timed out after {}s (continuing anyway)",
+            WARMUP_BUDGET.as_secs()
+        );
+    }
+
+    if let Ok(worker_id) = AgentId::from_agent_id(component.id, &warmup_agent) {
+        delete_workers(&user, &[worker_id]).await;
+    }
+    cleanup_user_state(&user, &env.id).await;
+    deps.kill_all().await;
+
+    info!("Cloud pre-flight warmup complete.");
+}
diff --git a/integration-tests/src/benchmarks/cleanup.rs b/integration-tests/src/benchmarks/cleanup.rs
new file mode 100644
index 0000000000..2047b06c4d
--- /dev/null
+++ b/integration-tests/src/benchmarks/cleanup.rs
@@ -0,0 +1,529 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Cleanup helpers for cloud-perf benchmarks.
+//!
+//! The [`CleanupClient`] trait is the narrow interface used by the cascading
+//! cleanup logic, which enables unit-testing with the [`MockCleanupClient`]
+//! below.
+
+use async_trait::async_trait;
+use golem_client::api::RegistryServiceClient;
+use golem_common::model::environment::EnvironmentId;
+use golem_test_framework::config::dsl_impl::TestUserContext;
+use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies};
+use tracing::warn;
+use uuid::Uuid;
+
+// ── Narrow trait ─────────────────────────────────────────────────────────────
+
+/// Narrow client interface covering only the operations used by the cascading
+/// cleanup helpers.  Use [`RegistryCleanupAdapter`] to wrap a real client and
+/// [`MockCleanupClient`] (in tests) to inject failures.
+#[async_trait]
+pub trait CleanupClient: Send + Sync {
+    /// Returns `(component_id, revision)` pairs for the components of the env.
+    async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>>;
+    async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()>;
+
+    /// Returns the IDs of the domain registrations of the env.
+    async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result<Vec<Uuid>>;
+    async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()>;
+
+    /// Returns the owning application's ID and the env's current revision.
+    async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)>;
+    async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()>;
+
+    /// Returns the application's current revision.
+    async fn get_application_revision(&self, app_id: &Uuid) -> anyhow::Result<u64>;
+    async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()>;
+
+    /// Returns the account's current revision.
+    async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result<u64>;
+    async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()>;
+}
+
+// ── Real adapter ─────────────────────────────────────────────────────────────
+
+/// Adapter exposing any `RegistryServiceClient` implementor through the
+/// narrow [`CleanupClient`] interface.
+pub struct RegistryCleanupAdapter<C> {
+    inner: C,
+}
+
+impl<C: RegistryServiceClient + Send + Sync> RegistryCleanupAdapter<C> {
+    pub fn new(inner: C) -> Self {
+        RegistryCleanupAdapter { inner }
+    }
+}
+
+#[async_trait]
+impl<C: RegistryServiceClient + Send + Sync> CleanupClient for RegistryCleanupAdapter<C> {
+    async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>> {
+        // Registry errors are flattened into `anyhow` via their `Debug` output.
+        let listed = self.inner.list_environment_components(env_id).await;
+        let page = listed.map_err(|err| anyhow::anyhow!("{err:?}"))?;
+        // Keep only the `(id, revision)` pair that `delete_component` takes.
+        let pairs = page
+            .values
+            .into_iter()
+            .map(|component| (component.id.0, component.revision.into()))
+            .collect();
+        Ok(pairs)
+    }
+
+    /// Forwards to the registry's `delete_component`; a failure is reported
+    /// through the error's `Debug` rendering.
+    async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        let outcome = self.inner.delete_component(id, revision).await;
+        outcome.map_err(|err| anyhow::anyhow!("{err:?}"))
+    }
+
+    async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result<Vec<Uuid>> {
+        let listed = self
+            .inner
+            .list_environment_domain_registrations(env_id)
+            .await;
+        let page = listed.map_err(|err| anyhow::anyhow!("{err:?}"))?;
+        Ok(page.values.into_iter().map(|reg| reg.id.0).collect())
+    }
+
+    /// Forwards to the registry's `delete_domain_registration`; a failure is
+    /// reported through the error's `Debug` rendering.
+    async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()> {
+        let outcome = self.inner.delete_domain_registration(id).await;
+        outcome.map_err(|err| anyhow::anyhow!("{err:?}"))
+    }
+
+    async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)> {
+        // Only the owning application's id and the env revision are kept.
+        self.inner
+            .get_environment(env_id)
+            .await
+            .map(|env| (env.application_id.0, env.revision.into()))
+            .map_err(|err| anyhow::anyhow!("{err:?}"))
+    }
+
+    /// Forwards to the registry's `delete_environment`; a failure is reported
+    /// through the error's `Debug` rendering.
+    async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        let outcome = self.inner.delete_environment(env_id, revision).await;
+        outcome.map_err(|err| anyhow::anyhow!("{err:?}"))
+    }
+
+    async fn get_application_revision(&self, app_id: &Uuid) -> anyhow::Result<u64> {
+        // The fetched application is reduced to its revision number.
+        self.inner
+            .get_application(app_id)
+            .await
+            .map(|app| app.revision.into())
+            .map_err(|err| anyhow::anyhow!("{err:?}"))
+    }
+
+    /// Forwards to the registry's `delete_application`; a failure is reported
+    /// through the error's `Debug` rendering.
+    async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        let outcome = self.inner.delete_application(app_id, revision).await;
+        outcome.map_err(|err| anyhow::anyhow!("{err:?}"))
+    }
+
+    async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result<u64> {
+        // The fetched account is reduced to its revision number.
+        self.inner
+            .get_account(account_id)
+            .await
+            .map(|account| account.revision.into())
+            .map_err(|err| anyhow::anyhow!("{err:?}"))
+    }
+
+    async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        // Any success payload is discarded; only the error is passed on.
+        match self.inner.delete_account(account_id, revision).await {
+            Ok(_) => Ok(()),
+            Err(err) => Err(anyhow::anyhow!("{err:?}")),
+        }
+    }
+}
+
+// ── Core cleanup logic (testable via CleanupClient) ───────────────────────────
+
+/// Best-effort teardown of a single environment (steps 1–4 of the cascading
+/// cleanup): components → domain registrations → environment → application.
+/// The account is **not** deleted here.
+///
+/// Nothing aborts the run: every failure is logged with `warn!` and the next
+/// step is attempted.  The one dependency is that the application step needs
+/// the application id returned by the environment lookup, so it is skipped
+/// when that lookup fails.
+///
+/// **Note:** Server-side cascading delete is incomplete (golemcloud/golem#3291).
+pub async fn cleanup_env_and_app_with(client: &dyn CleanupClient, env_id: &Uuid) {
+    // Step 1: components.  A failed listing is treated as "nothing to delete".
+    let components = client.list_env_components(env_id).await.unwrap_or_else(|e| {
+        warn!("cleanup: list components for env {env_id} failed (best-effort): {e:?}");
+        Vec::new()
+    });
+    for (cid, rev) in components {
+        if let Err(e) = client.delete_component(&cid, rev).await {
+            warn!("cleanup: delete component {cid} failed (best-effort): {e:?}");
+        }
+    }
+
+    // Step 2: domain registrations, handled the same way as components.
+    let registrations = client
+        .list_env_domain_registrations(env_id)
+        .await
+        .unwrap_or_else(|e| {
+            warn!(
+                "cleanup: list domain registrations for env {env_id} failed \
+                 (best-effort): {e:?}"
+            );
+            Vec::new()
+        });
+    for id in registrations {
+        if let Err(e) = client.delete_domain_registration(&id).await {
+            warn!("cleanup: delete domain registration {id} failed (best-effort): {e:?}");
+        }
+    }
+
+    // Step 3: environment.  The lookup also yields the application id that
+    // step 4 needs, so without it there is nothing further this function can do.
+    let (app_id, env_rev) = match client.get_env_app_id_and_revision(env_id).await {
+        Ok(found) => found,
+        Err(e) => {
+            warn!("cleanup: get environment {env_id} failed (best-effort): {e:?}");
+            return;
+        }
+    };
+    if let Err(e) = client.delete_environment(env_id, env_rev).await {
+        warn!("cleanup: delete environment {env_id} failed (best-effort): {e:?}");
+    }
+
+    // Step 4: application.  Attempted even when the environment delete failed.
+    let app_rev = match client.get_application_revision(&app_id).await {
+        Ok(rev) => rev,
+        Err(e) => {
+            warn!("cleanup: get application {app_id} failed (best-effort): {e:?}");
+            return;
+        }
+    };
+    if let Err(e) = client.delete_application(&app_id, app_rev).await {
+        warn!("cleanup: delete application {app_id} failed (best-effort): {e:?}");
+    }
+}
+
+/// Step 5 of the cascading cleanup: best-effort deletion of the user account.
+pub async fn cleanup_account_with(client: &dyn CleanupClient, account_id: &Uuid) {
+    let rev = match client.get_account_revision(account_id).await {
+        Ok(rev) => rev,
+        Err(e) => {
+            warn!("cleanup: get account {account_id} failed (best-effort): {e:?}");
+            return;
+        }
+    };
+    if let Err(e) = client.delete_account(account_id, rev).await {
+        warn!("cleanup: delete account {account_id} failed (best-effort): {e:?}");
+    }
+}
+
+// ── High-level wrappers (take a TestUserContext) ──────────────────────────────
+
+/// Runs steps 1–4 (components, domain registrations, environment, application)
+/// for `env_id` through a registry client obtained with `user.token`.
+///
+/// Benchmarks whose iterations create one user with several envs/apps (e.g.
+/// cold-start-unknown) call this once per env and [`cleanup_account`] once at the end.
+pub async fn cleanup_env_and_app(
+    user: &TestUserContext<BenchmarkTestDependencies>,
+    env_id: &EnvironmentId,
+) {
+    let registry = user.deps.registry_service();
+    let adapter = RegistryCleanupAdapter::new(registry.client(&user.token).await);
+    cleanup_env_and_app_with(&adapter, &env_id.0).await;
+}
+
+/// Step 5: deletes `user.account_id` through a client obtained with `user.token`.
+pub async fn cleanup_account(user: &TestUserContext<BenchmarkTestDependencies>) {
+    let registry = user.deps.registry_service();
+    let adapter = RegistryCleanupAdapter::new(registry.client(&user.token).await);
+    cleanup_account_with(&adapter, &user.account_id.0).await;
+}
+
+/// Convenience wrapper for the common single-env-per-user case: runs
+/// [`cleanup_env_and_app`] for `env_id`, then [`cleanup_account`] for `user`.
+pub async fn cleanup_user_state(
+    user: &TestUserContext<BenchmarkTestDependencies>,
+    env_id: &EnvironmentId,
+) {
+    cleanup_env_and_app(user, env_id).await;
+    cleanup_account(user).await;
+}
+
+// ── Unit tests ────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+pub mod tests {
+    use super::*;
+    use std::collections::HashSet;
+    use std::sync::{Arc, Mutex};
+    use test_r::test;
+
+    /// Drives `f` to completion on a fresh current-thread Tokio runtime, so the
+    /// synchronous `#[test]` functions below can call the async cleanup code.
+    fn block_on<F: std::future::Future>(f: F) -> F::Output {
+        let mut builder = tokio::runtime::Builder::new_current_thread();
+        let runtime = builder.enable_all().build().unwrap();
+        runtime.block_on(f)
+    }
+
+    /// In-process mock that records every operation attempted and fails the
+    /// operations listed in `fail_ops`.
+    pub struct MockCleanupClient {
+        /// Names of the operations that must return a simulated error.
+        fail_ops: HashSet<&'static str>,
+        /// Ordered log of every operation attempted, failed ones included.
+        pub calls: Arc<Mutex<Vec<&'static str>>>,
+        /// The `application_id` returned by `get_env_app_id_and_revision`.
+        pub app_id: Uuid,
+    }
+
+    impl MockCleanupClient {
+        /// Returns a mock failing the ops in `fail_ops`, plus a handle to its call log.
+        pub fn new(fail_ops: &[&'static str]) -> (Self, Arc<Mutex<Vec<&'static str>>>) {
+            let log = Arc::new(Mutex::new(Vec::new()));
+            let mock = Self {
+                fail_ops: fail_ops.iter().copied().collect(),
+                calls: Arc::clone(&log),
+                app_id: Uuid::new_v4(),
+            };
+            (mock, log)
+        }
+
+        /// Appends `name` to the call log.
+        fn record(&self, name: &'static str) {
+            self.calls.lock().unwrap().push(name);
+        }
+
+        /// Logs the call, then fails iff `name` was listed in `fail_ops`.
+        fn result(&self, name: &'static str) -> anyhow::Result<()> {
+            self.record(name);
+            anyhow::ensure!(!self.fail_ops.contains(name), "simulated failure in {name}");
+            Ok(())
+        }
+    }
+
+    /// Mock `CleanupClient`.
+    ///
+    /// Every method goes through `MockCleanupClient::result`, so the call is
+    /// logged first and then fails iff its name is in `fail_ops`; the error
+    /// message names the operation that was made to fail.
+    ///
+    /// Successful lookups return fixed data: one component, one domain
+    /// registration, `self.app_id` as the owning application, and revision `1`
+    /// (`0` for the component).  A fully successful `run` therefore attempts
+    /// each of the ten operations exactly once.
+    #[async_trait]
+    impl CleanupClient for MockCleanupClient {
+        /// Yields one component with a fresh random id and revision `0`.
+        async fn list_env_components(&self, _: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>> {
+            self.result("list_env_components")?;
+            Ok(vec![(Uuid::new_v4(), 0)])
+        }
+
+        /// Logs the call; fails only when listed in `fail_ops`.
+        async fn delete_component(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_component")
+        }
+
+        /// Yields one domain registration with a fresh random id.
+        async fn list_env_domain_registrations(&self, _: &Uuid) -> anyhow::Result<Vec<Uuid>> {
+            self.result("list_env_domain_registrations")?;
+            Ok(vec![Uuid::new_v4()])
+        }
+
+        /// Logs the call; fails only when listed in `fail_ops`.
+        async fn delete_domain_registration(&self, _: &Uuid) -> anyhow::Result<()> {
+            self.result("delete_domain_registration")
+        }
+
+        /// Yields `self.app_id` paired with environment revision `1`.
+        async fn get_env_app_id_and_revision(&self, _: &Uuid) -> anyhow::Result<(Uuid, u64)> {
+            self.result("get_env_app_id_and_revision")?;
+            Ok((self.app_id, 1))
+        }
+
+        /// Logs the call; fails only when listed in `fail_ops`.
+        async fn delete_environment(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_environment")
+        }
+
+        /// Yields application revision `1`.
+        async fn get_application_revision(&self, _: &Uuid) -> anyhow::Result<u64> {
+            self.result("get_application_revision")?;
+            Ok(1)
+        }
+
+        /// Logs the call; fails only when listed in `fail_ops`.
+        async fn delete_application(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_application")
+        }
+
+        /// Yields account revision `1`.
+        async fn get_account_revision(&self, _: &Uuid) -> anyhow::Result<u64> {
+            self.result("get_account_revision")?;
+            Ok(1)
+        }
+
+        /// Logs the call; fails only when listed in `fail_ops`.
+        async fn delete_account(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_account")
+        }
+    }
+
+    // ── Test helpers ──────────────────────────────────────────────────────────
+
+    fn all_ops() -> Vec<&'static str> {
+        Vec::from([
+            "list_env_components",
+            "delete_component",
+            "list_env_domain_registrations",
+            "delete_domain_registration",
+            "get_env_app_id_and_revision",
+            "delete_environment",
+            "get_application_revision",
+            "delete_application",
+            "get_account_revision",
+            "delete_account",
+        ])
+    }
+
+    /// Runs steps 1–4 and then step 5 against `mock`, using fresh random ids.
+    fn run(mock: &MockCleanupClient) {
+        let (env_id, account_id) = (Uuid::new_v4(), Uuid::new_v4());
+        block_on(async {
+            cleanup_env_and_app_with(mock, &env_id).await;
+            cleanup_account_with(mock, &account_id).await;
+        });
+    }
+
+    fn contains(calls: &[&str], op: &str) -> bool {
+        calls.iter().any(|logged| *logged == op)
+    }
+
+    // ── Tests ─────────────────────────────────────────────────────────────────
+
+    /// With nothing failing, each operation is attempted exactly once and in
+    /// cascade order (children before their parents, the account last).
+    #[test]
+    fn all_steps_run_on_success() {
+        let (mock, calls) = MockCleanupClient::new(&[]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        // `all_ops()` lists the operations in the order the cleanup performs
+        // them, so comparing whole sequences also pins ordering and the
+        // absence of repeated or missing calls.
+        assert_eq!(calls, all_ops());
+    }
+
+    /// A failed component listing (step 1) must not stop the later steps from
+    /// being attempted.
+    #[test]
+    fn step1_list_failure_continues() {
+        let (mock, calls) = MockCleanupClient::new(&["list_env_components"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        let attempted = |op: &str| contains(&calls, op);
+        assert!(attempted("list_env_domain_registrations"), "{calls:?}");
+        assert!(attempted("get_env_app_id_and_revision"), "{calls:?}");
+        assert!(attempted("get_account_revision"), "{calls:?}");
+    }
+
+    #[test]
+    fn step2_list_failure_continues() {
+        let (mock, calls) = MockCleanupClient::new(&["list_env_domain_registrations"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        let later = ["get_env_app_id_and_revision", "get_account_revision"];
+        assert!(later.iter().all(|op| contains(&calls, op)), "{calls:?}");
+    }
+
+    /// `get_env_app_id_and_revision` (step 3 get) fails, so no app_id is
+    /// available: step 4 must be skipped while step 5 still runs.
+    #[test]
+    fn step3_get_failure_skips_step4_runs_step5() {
+        let (mock, calls) = MockCleanupClient::new(&["get_env_app_id_and_revision"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+
+        // Step 4 always begins with the application lookup.
+        let step4_attempted = contains(&calls, "get_application_revision");
+        // Step 5 always begins with the account lookup.
+        let step5_attempted = contains(&calls, "get_account_revision");
+
+        assert!(!step4_attempted, "step 4 must be skipped when step 3 get fails; got: {calls:?}");
+        assert!(step5_attempted, "step 5 must still run; got: {calls:?}");
+    }
+
+    /// `delete_environment` fails after a successful lookup, so the app_id is
+    /// known: step 4 and step 5 must both be attempted.
+    #[test]
+    fn step3_delete_failure_still_runs_step4_and_step5() {
+        let (mock, calls) = MockCleanupClient::new(&["delete_environment"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        let later = ["get_application_revision", "get_account_revision"];
+        assert!(later.iter().all(|op| contains(&calls, op)), "{calls:?}");
+    }
+
+    /// A failed application lookup must skip only `delete_application` (there
+    /// is no revision to delete at); step 5 still runs.
+    #[test]
+    fn step4_failure_continues_to_step5() {
+        let (mock, calls) = MockCleanupClient::new(&["get_application_revision"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        assert!(!contains(&calls, "delete_application"), "{calls:?}");
+        assert!(contains(&calls, "get_account_revision"), "{calls:?}");
+    }
+
+    /// `get_account_revision` (step 5 get) fails: the lookup itself is still
+    /// logged, `delete_account` is never attempted, and the run completes
+    /// without panicking.
+    #[test]
+    fn step5_get_failure_no_delete_and_completes() {
+        let (mock, calls) = MockCleanupClient::new(&["get_account_revision"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+
+        let delete_attempted = contains(&calls, "delete_account");
+        assert!(contains(&calls, "get_account_revision"), "{calls:?}");
+        assert!(!delete_attempted, "delete_account must not run when get fails; got: {calls:?}");
+    }
+
+    /// All steps fail simultaneously — function completes without panic and exactly
+    /// the unconditional steps are attempted (no delete, no application lookup).
+    #[test]
+    fn all_steps_fail_no_short_circuit() {
+        let (mock, calls) = MockCleanupClient::new(&all_ops());
+        run(&mock); // must not panic
+        let calls = calls.lock().unwrap().clone();
+        let unconditional = vec![
+            "list_env_components",
+            "list_env_domain_registrations",
+            "get_env_app_id_and_revision",
+            "get_account_revision",
+        ];
+        assert_eq!(calls, unconditional);
+    }
+}
diff --git a/integration-tests/src/benchmarks/cold_start_unknown.rs b/integration-tests/src/benchmarks/cold_start_unknown.rs
index f29f297658..592b80e2e4 100644
--- a/integration-tests/src/benchmarks/cold_start_unknown.rs
+++ b/integration-tests/src/benchmarks/cold_start_unknown.rs
@@ -12,12 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent};
+use crate::benchmarks::{
+    cleanup_account, cleanup_env_and_app, delete_workers, invoke_and_await_agent,
+};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -196,6 +199,8 @@ impl Benchmark for ColdStartUnknownMedium {
 pub struct IterationContext {
     user: TestUserContext<BenchmarkTestDependencies>,
     agents: Vec<(ComponentDto, ParsedAgentId)>,
+    /// One env_id per size (cold_start creates one env per component).
+    env_ids: Vec<EnvironmentId>,
 }
 
 pub struct ColdStartUnknownBenchmark {
@@ -235,11 +240,13 @@ impl ColdStartUnknownBenchmark {
     pub async fn setup_iteration(&self, config: &RunConfig) -> IterationContext {
         let user = self.deps.user().await.unwrap();
         let mut agents = vec![];
+        let mut env_ids = vec![];
 
         for _ in 0..config.size {
             // Agent types names are unique within one environment,
             // so make sure each component get its own env
             let (_, env) = user.app_and_env().await.unwrap();
+            env_ids.push(env.id);
 
             let component = user
                 .component(&env.id, &self.component_name)
@@ -252,7 +259,11 @@ impl ColdStartUnknownBenchmark {
             agents.push((component, agent_id));
         }
 
-        IterationContext { user, agents }
+        IterationContext {
+            user,
+            agents,
+            env_ids,
+        }
     }
 
     pub async fn warmup(&self, config: &RunConfig) {
@@ -298,6 +309,14 @@ impl ColdStartUnknownBenchmark {
             .iter()
             .filter_map(|(component, agent_id)| AgentId::from_agent_id(component.id, agent_id).ok())
             .collect();
-        delete_workers(&iteration.user, &agent_ids).await
+        delete_workers(&iteration.user, &agent_ids).await;
+        // Clean up each env/app individually, then delete the account once.
+        // This avoids the account being deleted on the first env cleanup and
+        // causing subsequent cleanup calls to fail (since the user token would
+        // be invalid after account deletion).
+        for env_id in &iteration.env_ids {
+            cleanup_env_and_app(&iteration.user, env_id).await;
+        }
+        cleanup_account(&iteration.user).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/durability_overhead.rs b/integration-tests/src/benchmarks/durability_overhead.rs
index f956eb3636..fb864fd44c 100644
--- a/integration-tests/src/benchmarks/durability_overhead.rs
+++ b/integration-tests/src/benchmarks/durability_overhead.rs
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent};
+use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::{ComponentDto, ComponentId};
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -42,6 +43,7 @@ pub struct DurabilityOverheadIterationContext {
     durable_nonpersistent_agent_ids: Vec<ParsedAgentId>,
     ephemeral_agent_ids: Vec<ParsedAgentId>,
     durable_persistent_commit_agent_ids: Vec<ParsedAgentId>,
+    env_id: EnvironmentId,
 }
 
 fn agent_ids_to_agent_ids(component_id: ComponentId, agent_ids: &[ParsedAgentId]) -> Vec<AgentId> {
@@ -146,6 +148,7 @@ impl Benchmark for DurabilityOverhead {
             durable_nonpersistent_agent_ids,
             ephemeral_agent_ids,
             durable_persistent_commit_agent_ids,
+            env_id: env.id,
         }
     }
 
@@ -336,5 +339,6 @@ impl Benchmark for DurabilityOverhead {
             ),
         )
         .await;
+        cleanup_user_state(&context.user, &context.env_id).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/latency.rs b/integration-tests/src/benchmarks/latency.rs
index a44ff42333..006d29f228 100644
--- a/integration-tests/src/benchmarks/latency.rs
+++ b/integration-tests/src/benchmarks/latency.rs
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent};
+use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -200,6 +201,7 @@ pub struct IterationContext {
     component: ComponentDto,
     agent_ids: Vec<ParsedAgentId>,
     length: usize,
+    env_id: EnvironmentId,
 }
 
 pub struct LatencyBenchmark {
@@ -261,6 +263,7 @@ impl LatencyBenchmark {
             component,
             agent_ids,
             length: config.length,
+            env_id: env.id,
         }
     }
 
@@ -326,6 +329,7 @@ impl LatencyBenchmark {
             .iter()
             .filter_map(|agent_id| AgentId::from_agent_id(iteration.component.id, agent_id).ok())
             .collect();
-        delete_workers(&iteration.user, &agent_ids).await
+        delete_workers(&iteration.user, &agent_ids).await;
+        cleanup_user_state(&iteration.user, &iteration.env_id).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/mod.rs b/integration-tests/src/benchmarks/mod.rs
index b15dde89a3..d1651f063f 100644
--- a/integration-tests/src/benchmarks/mod.rs
+++ b/integration-tests/src/benchmarks/mod.rs
@@ -29,15 +29,19 @@ use std::time::{Duration, SystemTime};
 use tracing::{Instrument, info, warn};
 use tracing_opentelemetry::OpenTelemetrySpanExt;
 
+pub mod cleanup;
 pub mod cold_start_unknown;
 pub mod durability_overhead;
 pub mod latency;
 pub mod sleep;
 pub mod throughput;
 
-/// Injects the current tracing span's OpenTelemetry trace context (traceparent/tracestate)
-/// into a reqwest Request's headers so that downstream services can link their
-/// spans to the benchmark's trace.
+// Re-export cleanup helpers so callers can use the flat `benchmarks::*` path.
+pub use cleanup::{cleanup_account, cleanup_env_and_app, cleanup_user_state};
+
+/// Injects the current tracing span's OpenTelemetry trace context
+/// (traceparent/tracestate) into a reqwest Request's headers so that
+/// downstream services can link their spans to the benchmark's trace.
 fn inject_trace_context(request: &mut Request) {
     let current_span = tracing::Span::current();
     let otel_context = current_span.context();
diff --git a/integration-tests/src/benchmarks/sleep.rs b/integration-tests/src/benchmarks/sleep.rs
index 97bb64e16f..457872ed29 100644
--- a/integration-tests/src/benchmarks/sleep.rs
+++ b/integration-tests/src/benchmarks/sleep.rs
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::delete_workers;
+use crate::benchmarks::{cleanup_user_state, delete_workers};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -39,6 +40,7 @@ pub struct SleepIterationContext {
     user: TestUserContext<BenchmarkTestDependencies>,
     component: ComponentDto,
     agent_ids: Vec<ParsedAgentId>,
+    env_id: EnvironmentId,
 }
 
 #[async_trait]
@@ -111,6 +113,7 @@ impl Benchmark for Sleep {
             user,
             component,
             agent_ids,
+            env_id: env.id,
         }
     }
 
@@ -184,6 +187,7 @@ impl Benchmark for Sleep {
             .iter()
             .filter_map(|agent_id| AgentId::from_agent_id(context.component.id, agent_id).ok())
             .collect();
-        delete_workers(&context.user, &agent_ids).await
+        delete_workers(&context.user, &agent_ids).await;
+        cleanup_user_state(&context.user, &context.env_id).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/throughput.rs b/integration-tests/src/benchmarks/throughput.rs
index 9cdecd7a1f..5515090847 100644
--- a/integration-tests/src/benchmarks/throughput.rs
+++ b/integration-tests/src/benchmarks/throughput.rs
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent, invoke_and_await_http};
+use crate::benchmarks::{
+    cleanup_user_state, delete_workers, invoke_and_await_agent, invoke_and_await_http,
+};
 use async_trait::async_trait;
 use axum::http::{HeaderMap, HeaderValue};
 use futures_concurrency::future::Join;
@@ -21,6 +23,7 @@ use golem_common::base_model::agent::{DataValue, ParsedAgentId};
 use golem_common::model::agent::AgentTypeName;
 use golem_common::model::component::{ComponentDto, ComponentId};
 use golem_common::model::domain_registration::{Domain, DomainRegistrationCreation};
+use golem_common::model::environment::EnvironmentId;
 use golem_common::model::http_api_deployment::{
     HttpApiDeploymentAgentOptions, HttpApiDeploymentCreation,
 };
@@ -79,16 +82,14 @@ impl Benchmark for ThroughputEcho {
             "echo",
             "echo",
             Box::new(|_| data_value!("benchmark")),
-            Box::new(|port, idx, _length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/test-{idx}-http/echo/test-message"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, _length| {
+                let url =
+                    Url::parse(&format!("{base_url}/test-{idx}-http/echo/test-message")).unwrap();
                 Request::new(Method::POST, url)
             }),
-            Box::new(|port, idx, _length| {
+            Box::new(|base_url, idx, _length| {
                 let url = Url::parse(&format!(
-                    "http://localhost:{port}/rust/test-{idx}-http/echo/test-message"
+                    "{base_url}/rust/test-{idx}-http/echo/test-message"
                 ))
                 .unwrap();
                 Request::new(Method::POST, url)
@@ -179,21 +180,16 @@ impl Benchmark for ThroughputLargeInput {
                 let bytes = vec![0u8; length];
                 data_value!(bytes)
             }),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/test-{idx}-http/large-input"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url = Url::parse(&format!("{base_url}/test-{idx}-http/large-input")).unwrap();
                 let json_body = json!({"input": vec![0u8; length]}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
                 request
             }),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/rust/test-{idx}-http/large-input"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url =
+                    Url::parse(&format!("{base_url}/rust/test-{idx}-http/large-input")).unwrap();
                 let json_body = json!({"input": vec![0u8; length]}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
@@ -282,21 +278,16 @@ impl Benchmark for ThroughputCpuIntensive {
             "cpu_intensive",
             "cpuIntensive",
             Box::new(|length| data_value!(length as f64)),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/test-{idx}-http/cpu-intensive"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url = Url::parse(&format!("{base_url}/test-{idx}-http/cpu-intensive")).unwrap();
                 let json_body = json!({"length": length}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
                 request
             }),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/rust/test-{idx}-http/cpu-intensive"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url =
+                    Url::parse(&format!("{base_url}/rust/test-{idx}-http/cpu-intensive")).unwrap();
                 let json_body = json!({"length": length}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
@@ -402,14 +393,20 @@ impl AgentInvocationTarget {
         }
     }
 
-    pub fn prefix(&self, prefix: &str, routing_table: &RoutingTable) -> String {
+    pub fn prefix(&self, prefix: &str, routing_table: &Option<RoutingTable>) -> String {
         match self {
             AgentInvocationTarget::Single { .. } => prefix.to_string(),
             AgentInvocationTarget::Pair { pair, .. } => {
-                if pair.at_same_worker_executor(routing_table) {
-                    format!("{prefix}local-")
+                if let Some(rt) = routing_table {
+                    if pair.at_same_worker_executor(rt) {
+                        format!("{prefix}local-")
+                    } else {
+                        format!("{prefix}remote-")
+                    }
                 } else {
-                    format!("{prefix}remote-")
+                    // Routing table not available (no shard-manager port-forward
+                    // configured); all RPC pairs go into a single unlabeled bucket.
+                    prefix.to_string()
                 }
             }
         }
@@ -426,19 +423,35 @@ pub struct IterationContext {
     rust_agent_ids_for_http: Vec<ParsedAgentId>,
     ts_agent_ids_for_http: Vec<ParsedAgentId>,
     length: usize,
-    routing_table: RoutingTable,
+    /// `None` when shard-manager host/port are not configured (cloud mode
+    /// without port-forward). When `None`, RPC pairs go into a single unlabeled
+    /// bucket instead of being split into local/remote.
+    routing_table: Option<RoutingTable>,
     ts_rpc_agent_id_pairs: Vec<AgentIdPair>,
     rust_rpc_agent_id_pairs: Vec<AgentIdPair>,
+    env_id: EnvironmentId,
 }
 
+/// Type for HTTP request builder closures used by the throughput benchmark.
+/// Receives `(base_url, agent_index, length)` where `base_url` is the full
+/// scheme+host+port prefix (e.g. `http://localhost:8084` in local mode or
+/// `https://myenv.apps.golem.dev` in cloud mode).
+type HttpRequestFn = Box<dyn for<'a> Fn(&'a str, usize, usize) -> Request + Send + Sync + 'static>;
+
 pub struct ThroughputBenchmark {
     rust_method_name: String,
     ts_method_name: String,
     agent_params: Box<dyn Fn(usize) -> DataValue + Send + Sync + 'static>,
-    http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
-    rust_http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
+    http_request: HttpRequestFn,
+    rust_http_request: HttpRequestFn,
     deps: BenchmarkTestDependencies,
     call_count: usize,
+    /// Pre-built HTTP client for cloud-mode apps-domain calls
+    /// (`https://{env_id}.{apps_base_domain}`).  Cached here so the
+    /// connection pool is warm across benchmark iterations.
+    /// `None` in local/provided mode (client is built per-iteration from the
+    /// custom-request port with a Host header override).
+    cloud_http_client: Option<reqwest::Client>,
 }
 
 fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> Vec<AgentId> {
@@ -452,8 +465,8 @@ impl ThroughputBenchmark {
         rust_method_name: &str,
         ts_method_name: &str,
         agent_params: Box<dyn Fn(usize) -> DataValue + Send + Sync + 'static>,
-        http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
-        rust_http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
+        http_request: HttpRequestFn,
+        rust_http_request: HttpRequestFn,
         mode: &TestMode,
         verbosity: Level,
         cluster_size: usize,
@@ -461,21 +474,40 @@ impl ThroughputBenchmark {
         call_count: usize,
         otlp: bool,
     ) -> Self {
+        let deps = BenchmarkTestDependencies::new(
+            mode,
+            verbosity,
+            cluster_size,
+            disable_compilation_cache,
+            otlp,
+        )
+        .await;
+
+        // Build the cloud HTTP client once so the connection pool stays alive
+        // across all benchmark iterations.  In cloud mode requests go to
+        // https://{env_id}.{apps_base_domain}, so we use standard TLS with
+        // ALPN negotiation — NOT http2_prior_knowledge() which is for h2c
+        // (cleartext HTTP/2) and would bypass the ALPN step that the NLB
+        // terminating TLS expects.
+        let cloud_http_client = deps.apps_base_domain().map(|_| {
+            reqwest::ClientBuilder::new()
+                .pool_max_idle_per_host(1024)
+                .pool_idle_timeout(std::time::Duration::from_secs(90))
+                .tcp_nodelay(true)
+                .timeout(std::time::Duration::from_secs(180))
+                .build()
+                .expect("Failed to build cloud HTTP client for throughput benchmark")
+        });
+
         Self {
             rust_method_name: rust_method_name.to_string(),
             ts_method_name: ts_method_name.to_string(),
             agent_params,
             http_request,
             rust_http_request,
-            deps: BenchmarkTestDependencies::new(
-                mode,
-                verbosity,
-                cluster_size,
-                disable_compilation_cache,
-                otlp,
-            )
-            .await,
+            deps,
             call_count,
+            cloud_http_client,
         }
     }
 
@@ -491,13 +523,23 @@ impl ThroughputBenchmark {
         let mut ts_rpc_agent_id_pairs = vec![];
         let mut rust_rpc_agent_id_pairs = vec![];
 
-        let routing_table = self
-            .deps
-            .shard_manager()
-            .get_routing_table()
-            .await
-            .expect("Failed to get routing table");
-        info!("Fetched routing table: {routing_table}");
+        // Try to fetch the routing table from the shard-manager; on any error
+        // fall back to None (unlabeled single-bucket RPC), e.g. in cloud mode
+        // without a port-forward to the shard-manager.
+        let routing_table: Option<RoutingTable> =
+            match self.deps.shard_manager().get_routing_table().await {
+                Ok(rt) => {
+                    info!("Fetched routing table: {rt}");
+                    Some(rt)
+                }
+                Err(err) => {
+                    info!(
+                        "Shard-manager not available, skipping routing table (RPC pairs \
+                         will be unlabeled): {err:#}"
+                    );
+                    None
+                }
+            };
 
         let user = self.deps.user().await.unwrap();
         let (_, env) = user.app_and_env().await.unwrap();
@@ -542,7 +584,14 @@ impl ThroughputBenchmark {
 
         let client = user.registry_service_client().await;
 
-        let domain = Domain(format!("{}.golem.cloud", env.id));
+        // In cloud mode, use the configured apps_base_domain. Fall back to
+        // "golem.cloud" for local/provided modes.
+        let apps_base_domain = self
+            .deps
+            .apps_base_domain()
+            .unwrap_or("golem.cloud")
+            .to_string();
+        let domain = Domain(format!("{}.{}", env.id, apps_base_domain));
 
         async {
             client
@@ -605,6 +654,7 @@ impl ThroughputBenchmark {
             routing_table,
             ts_rpc_agent_id_pairs,
             rust_rpc_agent_id_pairs,
+            env_id: env.id,
         }
     }
 
@@ -713,7 +763,7 @@ impl ThroughputBenchmark {
     pub async fn run(&self, iteration: &IterationContext, recorder: BenchmarkRecorder) {
         async fn measure_agents(
             user: &TestUserContext<BenchmarkTestDependencies>,
-            routing_table: &RoutingTable,
+            routing_table: &Option<RoutingTable>,
             recorder: &BenchmarkRecorder,
             length: usize,
             call_count: usize,
@@ -799,31 +849,51 @@ impl ThroughputBenchmark {
         .instrument(tracing::info_span!("measure_ts_agents"))
         .await;
 
-        let port = self.deps.worker_service().custom_request_port();
-
-        let client = {
-            let mut headers = HeaderMap::new();
-            headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap());
-            reqwest::Client::builder()
-                .default_headers(headers)
-                .build()
-                .expect("Failed to create HTTP client")
-        };
+        // Resolve the base URL prefix and HTTP client for the code-first HTTP
+        // API benchmark paths. The request-builder closures append the route
+        // path (e.g. "/test-0-http/echo/...") to this prefix.
+        //
+        //   cloud mode:  base = "https://{env_id}.{apps_base_domain}"
+        //                → reqwest connects directly to that host (TLS/SNI +
+        //                  Host set from the URL); the apps gateway routes it
+        //                  to worker-service. Uses the cached, pool-warm client.
+        //
+        //   local mode:  base = "http://localhost:{custom_request_port}"
+        //                → reqwest connects to localhost; an explicit Host
+        //                  header ("{env_id}.golem.cloud") tells the local
+        //                  worker-service which deployment to route to.
+        let (http_base_url, client): (String, reqwest::Client) =
+            if let Some(ref cached) = self.cloud_http_client {
+                let base = format!("https://{}", iteration.domain.0);
+                (base, cached.clone())
+            } else {
+                let port = self.deps.worker_service().custom_request_port();
+                let base = format!("http://localhost:{port}");
+                let mut headers = HeaderMap::new();
+                headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap());
+                let c = reqwest::Client::builder()
+                    .default_headers(headers)
+                    .build()
+                    .expect("Failed to create HTTP client");
+                (base, c)
+            };
 
         async {
             let client = client.clone();
+            let base = http_base_url.clone();
             let result_futures = iteration
                 .rust_agent_ids_for_http
                 .iter()
                 .enumerate()
                 .map(move |(idx, _agent_id)| {
                     let client = client.clone();
+                    let base = base.clone();
                     async move {
                         let mut results = vec![];
                         for _ in 0..self.call_count {
                             results.push(
                                 invoke_and_await_http(client.clone(), || {
-                                    (self.rust_http_request)(port, idx, iteration.length)
+                                    (self.rust_http_request)(&base, idx, iteration.length)
                                 })
                                 .await,
                             )
@@ -850,12 +920,13 @@ impl ThroughputBenchmark {
                 .enumerate()
                 .map(move |(idx, _agent_id)| {
                     let client = client.clone();
+                    let base = http_base_url.clone();
                     async move {
                         let mut results = vec![];
                         for _ in 0..self.call_count {
                             results.push(
                                 invoke_and_await_http(client.clone(), || {
-                                    (self.http_request)(port, idx, iteration.length)
+                                    (self.http_request)(&base, idx, iteration.length)
                                 })
                                 .await,
                             )
@@ -969,5 +1040,6 @@ impl ThroughputBenchmark {
             }
         }
         delete_workers(&iteration.user, &rust_rpc_workers).await;
+        cleanup_user_state(&iteration.user, &iteration.env_id).await;
     }
 }

From 341bab33e3e55fd2b4214d8f0affd036d5f76c6c Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 2 Jun 2026 22:15:18 -0700
Subject: [PATCH 02/60] feat: add run specific details to perf tests

---
 golem-test-framework/src/benchmark/mod.rs     |  4 +-
 golem-test-framework/src/benchmark/results.rs | 99 +++++++++++++++++++
 .../benchmark_suites/cloud-perf.yaml          | 28 ++++++
 integration-tests/src/benchmarks/all.rs       | 11 ++-
 4 files changed, 139 insertions(+), 3 deletions(-)
 create mode 100644 integration-tests/benchmark_suites/cloud-perf.yaml

diff --git a/golem-test-framework/src/benchmark/mod.rs b/golem-test-framework/src/benchmark/mod.rs
index fd246b2be7..5e82adde15 100644
--- a/golem-test-framework/src/benchmark/mod.rs
+++ b/golem-test-framework/src/benchmark/mod.rs
@@ -16,7 +16,9 @@ mod config;
 mod results;
 
 pub use config::{BenchmarkConfig, BenchmarkSuite, BenchmarkSuiteItem, RunConfig};
-pub use results::{BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey};
+pub use results::{
+    BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey, RunMetadata,
+};
 
 use crate::config::benchmark::TestMode;
 use async_trait::async_trait;
diff --git a/golem-test-framework/src/benchmark/results.rs b/golem-test-framework/src/benchmark/results.rs
index a309d1d5e5..afb7319a7d 100644
--- a/golem-test-framework/src/benchmark/results.rs
+++ b/golem-test-framework/src/benchmark/results.rs
@@ -484,6 +484,97 @@ impl Display for BenchmarkResultView {
     }
 }
 
+/// Cloud-mode run metadata collected by the buildspec and passed via environment variables.
+/// All fields are optional — missing env vars produce `None` rather than failing the run.
+#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct RunMetadata {
+    /// The `golem-oss` commit SHA that was built and deployed.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub golem_oss_commit_sha: Option<String>,
+    /// The `golem-cloud` (kubernetes manifests) commit SHA that was deployed.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub kubernetes_manifest_commit_sha: Option<String>,
+    /// Number of Ready `worker-executor` pods observed at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub observed_cluster_size: Option<u32>,
+    /// Container image tag of the deployed `worker-executor`.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_executor_image_tag: Option<String>,
+    /// Container image tag of the deployed `registry-service`.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub registry_service_image_tag: Option<String>,
+    /// Container image tag of the deployed `worker-service`.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_service_image_tag: Option<String>,
+    /// Aurora ACU capacity for the main (`golem_dev`) cluster at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub aurora_acu_main: Option<f64>,
+    /// Aurora ACU capacity for the indexed-storage cluster at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub aurora_acu_indexed: Option<f64>,
+    /// Aurora ACU capacity for the keyvalue-storage cluster at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub aurora_acu_keyvalue: Option<f64>,
+    /// Ready replica count for `worker-executor` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_executor_replicas: Option<u32>,
+    /// Ready replica count for `worker-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_service_replicas: Option<u32>,
+    /// Ready replica count for `registry-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub registry_service_replicas: Option<u32>,
+    /// Ready replica count for `compilation-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub compilation_service_replicas: Option<u32>,
+    /// Ready replica count for `debugging-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub debugging_service_replicas: Option<u32>,
+    /// Free-form note from the `workflow_dispatch` trigger.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub note: Option<String>,
+}
+
+impl RunMetadata {
+    /// Reads the `GOLEM_BENCH_*` environment variables and returns a populated
+    /// `RunMetadata`.  Missing, empty, or unparsable values produce `None`.
+    pub fn from_env() -> Self {
+        fn env_str(key: &str) -> Option<String> {
+            std::env::var(key).ok().filter(|v| !v.is_empty())
+        }
+        fn env_u32(key: &str) -> Option<u32> {
+            env_str(key).and_then(|v| v.parse().ok())
+        }
+        fn env_f64(key: &str) -> Option<f64> {
+            env_str(key).and_then(|v| v.parse().ok())
+        }
+
+        Self {
+            golem_oss_commit_sha: env_str("GOLEM_BENCH_OSS_COMMIT_SHA"),
+            kubernetes_manifest_commit_sha: env_str("GOLEM_BENCH_K8S_MANIFEST_COMMIT_SHA"),
+            observed_cluster_size: env_u32("GOLEM_BENCH_OBSERVED_CLUSTER_SIZE"),
+            worker_executor_image_tag: env_str("GOLEM_BENCH_WORKER_EXECUTOR_IMAGE_TAG"),
+            registry_service_image_tag: env_str("GOLEM_BENCH_REGISTRY_SERVICE_IMAGE_TAG"),
+            worker_service_image_tag: env_str("GOLEM_BENCH_WORKER_SERVICE_IMAGE_TAG"),
+            aurora_acu_main: env_f64("GOLEM_BENCH_AURORA_ACU_MAIN"),
+            aurora_acu_indexed: env_f64("GOLEM_BENCH_AURORA_ACU_INDEXED"),
+            aurora_acu_keyvalue: env_f64("GOLEM_BENCH_AURORA_ACU_KEYVALUE"),
+            worker_executor_replicas: env_u32("GOLEM_BENCH_WORKER_EXECUTOR_REPLICAS"),
+            worker_service_replicas: env_u32("GOLEM_BENCH_WORKER_SERVICE_REPLICAS"),
+            registry_service_replicas: env_u32("GOLEM_BENCH_REGISTRY_SERVICE_REPLICAS"),
+            compilation_service_replicas: env_u32("GOLEM_BENCH_COMPILATION_SERVICE_REPLICAS"),
+            debugging_service_replicas: env_u32("GOLEM_BENCH_DEBUGGING_SERVICE_REPLICAS"),
+            note: env_str("GOLEM_BENCH_RUN_NOTE"),
+        }
+    }
+
+    /// Returns `true` if every field is `None` (nothing was read from env).
+    pub fn is_empty(&self) -> bool {
+        self == &Self::default()
+    }
+}
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct BenchmarkSuiteResultCollection {
     pub runs: Vec<BenchmarkSuiteResult>,
@@ -491,6 +582,8 @@ pub struct BenchmarkSuiteResultCollection {
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct BenchmarkSuiteResult {
+    /// Result format version. Always `1` for results produced by this binary.
+    pub schema_version: u32,
     pub suite: String,
     pub environment: String,
     pub version: String,
@@ -499,6 +592,10 @@ pub struct BenchmarkSuiteResult {
     /// cross-run correlation and garbage collection of orphaned state.
     #[serde(skip_serializing_if = "Option::is_none", default)]
     pub run_id: Option<String>,
+    /// Cloud-mode run metadata populated from `GOLEM_BENCH_*` environment variables.
+    /// `None` in Spawned or Provided modes, or when none of those variables are set.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub run_metadata: Option<RunMetadata>,
     pub results: Vec<BenchmarkResult>,
 }
 
@@ -530,11 +627,13 @@ impl BenchmarkSuiteResult {
         );
 
         Self {
+            schema_version: 1,
             suite: suite.to_string(),
             environment,
             version: golem_common::golem_version().to_string(),
             timestamp: Utc::now(),
             run_id: None,
+            run_metadata: None,
             results: vec![],
         }
     }
diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
new file mode 100644
index 0000000000..5387f35da6
--- /dev/null
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -0,0 +1,28 @@
+# Cloud-perf benchmark suite — runs the existing benchmark suite against a
+# deployed Golem environment via Gateway-API hostnames (TestMode::Cloud).
+#
+# Run with the benchmarks binary's `cloud` subcommand:
+#
+#   benchmarks suite integration-tests/benchmark_suites/cloud-perf.yaml \
+#     --save-to-json result.json \
+#     cloud \
+#       --api-url https://<your-golem-api-host> \
+#       --apps-base-domain <your-apps-base-domain> \
+#       --admin-account-token <token> \
+#       --builtin-plugin-owner-account-id <uuid> \
+#       --default-plan-id <uuid> \
+#       --component-directory <path-to-wasm-components>
+#
+# Initial milestone: latency-small only.  Add the remaining six benchmarks
+# once this runs end-to-end successfully.
+
+name: cloud-perf
+benchmarks:
+  # Measures cold and hot invocation latency through the Gateway using the
+  # small Rust agent (benchmark_agent_rust_release.wasm).  Three iterations
+  # capture real-network jitter and load-balancer warm-up variance.
+  - name: latency-small
+    iterations: 3
+    clusterSize: [2]
+    size: [5, 10]
+    length: [2]
diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs
index 6865ecf6e4..9b1efd1eb2 100644
--- a/integration-tests/src/benchmarks/all.rs
+++ b/integration-tests/src/benchmarks/all.rs
@@ -21,7 +21,7 @@ use golem_common::model::environment::{EnvironmentCreation, EnvironmentName};
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{
     Benchmark, BenchmarkApi, BenchmarkConfig, BenchmarkResult, BenchmarkSuite, BenchmarkSuiteItem,
-    BenchmarkSuiteResult,
+    BenchmarkSuiteResult, RunMetadata,
 };
 use golem_test_framework::config::benchmark::{TestMode, cloud_bench_run_id};
 use golem_test_framework::config::{
@@ -233,9 +233,16 @@ async fn main() {
                 // no else: we already validated all names above
             }
 
-            // Attach the run_id to result metadata (cloud mode only).
+            // Attach the run_id and run_metadata to result metadata (cloud mode only).
             if let Some(run_id) = cloud_bench_run_id() {
                 suite_result.run_id = Some(format!("bench-{run_id}"));
+
+                // Read GOLEM_BENCH_* env vars set by the buildspec before invoking
+                // the binary. Missing vars produce None rather than failing the run.
+                let metadata = RunMetadata::from_env();
+                if !metadata.is_empty() {
+                    suite_result.run_metadata = Some(metadata);
+                }
             }
 
             if let Some(path) = save_to_json {

From b1764ece0906ebdbae8d02fd4b1c8436468113fe Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Thu, 4 Jun 2026 16:25:01 -0700
Subject: [PATCH 03/60] fix(benchmark): make --builtin-plugin-owner-account-id
 and --default-plan-id optional

---
 golem-test-framework/src/config/benchmark.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/golem-test-framework/src/config/benchmark.rs b/golem-test-framework/src/config/benchmark.rs
index e97a58c47d..34ac140d23 100644
--- a/golem-test-framework/src/config/benchmark.rs
+++ b/golem-test-framework/src/config/benchmark.rs
@@ -280,10 +280,14 @@ pub enum TestMode {
         #[arg(long)]
         admin_account_token: String,
         /// UUID of the builtin-plugin-owner account.
-        #[arg(long)]
+        /// Only needed for environment-plugin-grant tests; benchmarks do not
+        /// use it so the default (nil UUID) is fine for benchmark runs.
+        #[arg(long, default_value_t = Uuid::nil())]
         builtin_plugin_owner_account_id: Uuid,
         /// UUID of the default plan on the target cluster.
-        #[arg(long)]
+        /// Only needed for environment-plugin-grant tests; benchmarks do not
+        /// use it so the default (nil UUID) is fine for benchmark runs.
+        #[arg(long, default_value_t = Uuid::nil())]
         default_plan_id: Uuid,
         /// Optional shard-manager gRPC hostname for a kubectl port-forward
         /// (e.g. `localhost`). When set together with

From 4294bdbe29bf9a04f54394da9778a5cf95d48232 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Thu, 4 Jun 2026 19:16:43 -0700
Subject: [PATCH 04/60] fix: make ProvidedShardManager kill/restart no-ops
 instead of panics

kill_all() is called after cloud_preflight_warmup completes. ProvidedShardManager
wraps an already-running process we don't own, so neither kill nor restart should
crash the binary. Both are now silent no-ops, matching UnavailableShardManager.
---
 golem-test-framework/src/components/shard_manager/provided.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/golem-test-framework/src/components/shard_manager/provided.rs b/golem-test-framework/src/components/shard_manager/provided.rs
index d7e4ff1305..84d213a5fb 100644
--- a/golem-test-framework/src/components/shard_manager/provided.rs
+++ b/golem-test-framework/src/components/shard_manager/provided.rs
@@ -40,10 +40,10 @@ impl ShardManager for ProvidedShardManager {
     }
 
     async fn kill(&self) {
-        panic!("Cannot kill provided shard manager");
+        // Nothing to do — we do not own this shard manager process.
     }
 
     async fn restart(&self, _number_of_shards_override: Option<usize>) {
-        panic!("Cannot restart provided shard manager");
+        // Nothing to do — we do not own this shard manager process.
     }
 }

From 5b9902b55875d772d16e1fc004160d13caf6d8a1 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 10:44:57 -0700
Subject: [PATCH 05/60] feat(benchmark): enable all tests

---
 .../benchmark_suites/cloud-perf.yaml          | 101 ++++++++++++++++--
 1 file changed, 94 insertions(+), 7 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 5387f35da6..9805b18e6f 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -1,4 +1,4 @@
-# Cloud-perf benchmark suite — runs the existing benchmark suite against a
+# Cloud-perf benchmark suite — runs the full benchmark suite against a
 # deployed Golem environment via Gateway-API hostnames (TestMode::Cloud).
 #
 # Run with the benchmarks binary's `cloud` subcommand:
@@ -13,16 +13,103 @@
 #       --default-plan-id <uuid> \
 #       --component-directory <path-to-wasm-components>
 #
-# Initial milestone: latency-small only.  Add the remaining six benchmarks
-# once this runs end-to-end successfully.
+# Note: clusterSize is ignored in Cloud mode (the observed cluster size is
+# read from shard-manager at run start and recorded in result metadata).
 
 name: cloud-perf
 benchmarks:
-  # Measures cold and hot invocation latency through the Gateway using the
-  # small Rust agent (benchmark_agent_rust_release.wasm).  Three iterations
-  # capture real-network jitter and load-balancer warm-up variance.
+  # Cold-start: compilation cache disabled — measures true cold-start latency
+  # with no warm compiled artefact available.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [2]
+    disableCompilationCache: true
+
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [5]
+    disableCompilationCache: true
+
+  # Cold-start: compilation cache enabled — measures latency once the compiled
+  # artefact is available in the cache.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [2]
+
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [5]
+
+  # Invocation latency — hot and cold paths through the Gateway NLB.
+  # Large worker counts to stress the load balancer and connection pool.
+  # size   = number of workers created
+  # length = number of hot invocations per worker after the first cold one
   - name: latency-small
     iterations: 3
     clusterSize: [2]
-    size: [5, 10]
+    size: [100, 500, 1000, 2000, 5000]
     length: [2]
+
+  - name: latency-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000]
+    length: [5]
+
+  # Sleep — measures worker suspension and resumption under real network
+  # conditions.
+  # size   = number of workers launched in parallel
+  # length = sleep duration in milliseconds
+  - name: sleep
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 100, 500, 1000]
+    length: [10000]
+
+  # Durability overhead — measures the cost of durable vs ephemeral execution
+  # across four variants (durable-persistent, durable-non-persistent,
+  # ephemeral, durable-persistent-commit).
+  # size   = number of workers per variant
+  # length = loop iteration count passed to oplog_heavy
+  - name: durability-overhead
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 50, 100, 200]
+    length: [5000]
+
+  # Throughput — measures invocation throughput across six implementations.
+  # size   = number of workers per implementation
+  # length = unused for echo
+  - name: throughput-echo
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 50, 100, 250]
+    length: [1000]
+
+  # size   = number of workers per implementation
+  # length = payload size in bytes sent to large_input
+  - name: throughput-large-input
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 25, 50]
+    length: [100, 10000]
+
+  # size   = number of workers per implementation
+  # length = CPU work length passed to cpu_intensive
+  - name: throughput-cpu-intensive
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 25, 50]
+    length: [100]

From 742a6695ad564e942026f42caec01b1dad075b5c Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 13:34:01 -0700
Subject: [PATCH 06/60] feat: retry connectivity to shard manager

---
 .../src/components/shard_manager/mod.rs       | 70 ++++++++++++++-----
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs
index 67b10051bf..bb387cd1ac 100644
--- a/golem-test-framework/src/components/shard_manager/mod.rs
+++ b/golem-test-framework/src/components/shard_manager/mod.rs
@@ -31,7 +31,7 @@ use std::sync::Arc;
 use std::time::Duration;
 use tonic::codec::CompressionEncoding;
 use tonic::transport::Channel;
-use tracing::Level;
+use tracing::{Level, warn};
 
 #[async_trait]
 pub trait ShardManager: Send + Sync {
@@ -47,25 +47,30 @@ pub trait ShardManager: Send + Sync {
     async fn restart(&self, number_of_shards_override: Option<usize>);
 
     async fn get_routing_table(&self) -> crate::Result<RoutingTable> {
-        let routing_table = self
-            .client()
-            .await
-            .get_routing_table(GetRoutingTableRequest {})
-            .await
-            .expect("Unable to fetch the routing table from shard-manager-service");
-
-        match routing_table.into_inner() {
-            shardmanager::v1::GetRoutingTableResponse {
-                result:
-                    Some(shardmanager::v1::get_routing_table_response::Result::Success(routing_table)),
-            } => Ok(routing_table
-                .try_into()
-                .map_err(|e| anyhow!("Failed converting routing table: {e}"))?),
-            shardmanager::v1::GetRoutingTableResponse {
-                result: Some(shardmanager::v1::get_routing_table_response::Result::Failure(err)),
-            } => Err(anyhow!("Failed to get routing table: {err:?}")),
-            _ => Err(anyhow!("Failed to get routing table")),
+        // Retry with a fixed delay to tolerate transient port-forward reconnects.
+        // The port-forward watchdog restarts in ~500ms, so 10 attempts with a
+        // 1s delay give ~10s of tolerance before giving up.
+        let max_attempts = 10;
+        let retry_delay = Duration::from_secs(1);
+        let mut last_err = anyhow!("get_routing_table: no attempts made");
+
+        for attempt in 1..=max_attempts {
+            match try_get_routing_table(&self.grpc_host(), self.grpc_port()).await {
+                Ok(rt) => return Ok(rt),
+                Err(err) => {
+                    warn!(
+                        attempt,
+                        max_attempts,
+                        error = %err,
+                        "Failed to fetch routing table, retrying..."
+                    );
+                    last_err = err;
+                    tokio::time::sleep(retry_delay).await;
+                }
+            }
         }
+
+        Err(last_err)
     }
 }
 
@@ -77,6 +82,33 @@ async fn new_client(host: &str, grpc_port: u16) -> ShardManagerServiceClient<Cha
         .accept_compressed(CompressionEncoding::Gzip)
 }
 
+async fn try_get_routing_table(host: &str, grpc_port: u16) -> crate::Result<RoutingTable> {
+    let mut client =
+        ShardManagerServiceClient::connect(format!("http://{host}:{grpc_port}"))
+            .await
+            .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))?
+            .send_compressed(CompressionEncoding::Gzip)
+            .accept_compressed(CompressionEncoding::Gzip);
+
+    let routing_table = client
+        .get_routing_table(GetRoutingTableRequest {})
+        .await
+        .map_err(|e| anyhow!("Unable to fetch the routing table from shard-manager-service: {e}"))?;
+
+    match routing_table.into_inner() {
+        shardmanager::v1::GetRoutingTableResponse {
+            result:
+                Some(shardmanager::v1::get_routing_table_response::Result::Success(routing_table)),
+        } => Ok(routing_table
+            .try_into()
+            .map_err(|e| anyhow!("Failed converting routing table: {e}"))?),
+        shardmanager::v1::GetRoutingTableResponse {
+            result: Some(shardmanager::v1::get_routing_table_response::Result::Failure(err)),
+        } => Err(anyhow!("Failed to get routing table: {err:?}")),
+        _ => Err(anyhow!("Failed to get routing table")),
+    }
+}
+
 async fn wait_for_startup(
     host: &str,
     grpc_port: u16,

From 18d5af6950c06aa02409dfd96449c35213884302 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 14:15:16 -0700
Subject: [PATCH 07/60] chore: fmt

---
 .../src/components/shard_manager/mod.rs           | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs
index bb387cd1ac..91ed2ed2da 100644
--- a/golem-test-framework/src/components/shard_manager/mod.rs
+++ b/golem-test-framework/src/components/shard_manager/mod.rs
@@ -83,17 +83,18 @@ async fn new_client(host: &str, grpc_port: u16) -> ShardManagerServiceClient<Cha
 }
 
 async fn try_get_routing_table(host: &str, grpc_port: u16) -> crate::Result<RoutingTable> {
-    let mut client =
-        ShardManagerServiceClient::connect(format!("http://{host}:{grpc_port}"))
-            .await
-            .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))?
-            .send_compressed(CompressionEncoding::Gzip)
-            .accept_compressed(CompressionEncoding::Gzip);
+    let mut client = ShardManagerServiceClient::connect(format!("http://{host}:{grpc_port}"))
+        .await
+        .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))?
+        .send_compressed(CompressionEncoding::Gzip)
+        .accept_compressed(CompressionEncoding::Gzip);
 
     let routing_table = client
         .get_routing_table(GetRoutingTableRequest {})
         .await
-        .map_err(|e| anyhow!("Unable to fetch the routing table from shard-manager-service: {e}"))?;
+        .map_err(|e| {
+            anyhow!("Unable to fetch the routing table from shard-manager-service: {e}")
+        })?;
 
     match routing_table.into_inner() {
         shardmanager::v1::GetRoutingTableResponse {

From 395bcd2113017d99a398ff5a32ab072215c662dd Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 15:48:54 -0700
Subject: [PATCH 08/60] investigation: run echo test first to see if they get
 stuck again

---
 .../benchmark_suites/cloud-perf.yaml          | 59 +++++++++++--------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 9805b18e6f..6e258c3c99 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -15,9 +15,41 @@
 #
 # Note: clusterSize is ignored in Cloud mode (the observed cluster size is
 # read from shard-manager at run start and recorded in result metadata).
+#
+# Suite order rationale: throughput benchmarks run first because they involve
+# RPC worker pairs and HTTP deployments — the most complex setup. Running them
+# early surfaces infrastructure issues (stuck workers, port-forward drops)
+# before spending time on the simpler benchmarks.
 
 name: cloud-perf
 benchmarks:
+  # Throughput — measures invocation throughput across six implementations:
+  # rust agent (gRPC), TS agent (gRPC), rust agent (HTTP), TS agent (HTTP),
+  # TS RPC pair, rust RPC pair.
+  # size   = number of workers per implementation (×6 implementations total)
+  # length = unused for echo
+  - name: throughput-echo
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 50, 100, 250]
+    length: [1000]
+
+  # size   = number of workers per implementation
+  # length = payload size in bytes sent to large_input
+  - name: throughput-large-input
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 25, 50]
+    length: [100, 10000]
+
+  # size   = number of workers per implementation
+  # length = CPU work length passed to cpu_intensive
+  - name: throughput-cpu-intensive
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 25, 50]
+    length: [100]
+
   # Cold-start: compilation cache disabled — measures true cold-start latency
   # with no warm compiled artefact available.
   # size   = number of unique components created (each in its own env)
@@ -40,6 +72,8 @@ benchmarks:
   # artefact is available in the cache.
   # size   = number of unique components created (each in its own env)
   # length = seconds to wait per component for pre-compilation warm-up
+  # NOTE: if results here are close to the cache-disabled entries above, the
+  # warm-up wait is too short and compilation hasn't finished — bump length.
   - name: cold-start-unknown-small
     iterations: 3
     clusterSize: [2]
@@ -88,28 +122,3 @@ benchmarks:
     clusterSize: [2]
     size: [10, 50, 100, 200]
     length: [5000]
-
-  # Throughput — measures invocation throughput across six implementations.
-  # size   = number of workers per implementation
-  # length = unused for echo
-  - name: throughput-echo
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 10, 50, 100, 250]
-    length: [1000]
-
-  # size   = number of workers per implementation
-  # length = payload size in bytes sent to large_input
-  - name: throughput-large-input
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 10, 25, 50]
-    length: [100, 10000]
-
-  # size   = number of workers per implementation
-  # length = CPU work length passed to cpu_intensive
-  - name: throughput-cpu-intensive
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 10, 25, 50]
-    length: [100]

From dac3c697826b2bc25192b5c14cb3e40ef072e821 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 19:28:27 -0700
Subject: [PATCH 09/60] feat(benchmark): lower number of concurrent live apps

---
 integration-tests/benchmark_suites/cloud-perf.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 6e258c3c99..b91a3d1821 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -31,7 +31,7 @@ benchmarks:
   - name: throughput-echo
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 50, 100, 250]
+    size: [1, 10, 50, 100]
     length: [1000]
 
   # size   = number of workers per implementation

From 22566231cfd3e148fde30c84253fa597b366c378 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 21:53:40 -0700
Subject: [PATCH 10/60] feat: more observability, make memory component
 coefficient configurable

---
 .../config/debug-worker-executor.sample.env   |  2 +
 .../config/debug-worker-executor.toml         |  2 +
 .../config/worker-executor.sample.env         |  3 ++
 .../config/worker-executor.toml               |  3 ++
 golem-worker-executor/src/metrics.rs          | 42 +++++++++++++------
 .../src/services/golem_config.rs              | 12 ++++++
 golem-worker-executor/src/worker/mod.rs       | 10 ++++-
 7 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env
index 077c693c32..66afafc82a 100644
--- a/golem-debugging-service/config/debug-worker-executor.sample.env
+++ b/golem-debugging-service/config/debug-worker-executor.sample.env
@@ -55,6 +55,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -228,6 +229,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml
index 8ee03a9c23..7191c36996 100644
--- a/golem-debugging-service/config/debug-worker-executor.toml
+++ b/golem-debugging-service/config/debug-worker-executor.toml
@@ -96,6 +96,7 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
+component_size_coefficient = 2.0
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
 
@@ -364,6 +365,7 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# component_size_coefficient = 2.0
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env
index 2a52884966..2ef7701cc5 100644
--- a/golem-worker-executor/config/worker-executor.sample.env
+++ b/golem-worker-executor/config/worker-executor.sample.env
@@ -72,6 +72,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -291,6 +292,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -480,6 +482,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml
index 4c89275519..b1bab39be9 100644
--- a/golem-worker-executor/config/worker-executor.toml
+++ b/golem-worker-executor/config/worker-executor.toml
@@ -125,6 +125,7 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
+component_size_coefficient = 2.0
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
 
@@ -456,6 +457,7 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# component_size_coefficient = 2.0
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
@@ -757,6 +759,7 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# component_size_coefficient = 2.0
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index 768b3e6b98..c9d610e79d 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -69,18 +69,26 @@ const SCHEDULER_LAG_BUCKETS: &[f64; 11] = &[
     0.001, 0.01, 0.1, 1.0, 5.0, 15.0, 30.0, 60.0, 120.0, 300.0, 600.0,
 ];
 
-const MEMORY_SIZE_BUCKETS: &[f64; 11] = &[
-    1024.0,
-    4096.0,
-    16384.0,
-    65536.0,
-    262144.0,
-    1048576.0,
-    4194304.0,
-    16777216.0,
-    67108864.0,
-    268435456.0,
-    1073741824.0,
+/// Buckets for the size of a single `memory.grow` allocation. Deliberately
+/// fine-grained in the 1-32 MiB band where typical guest grows cluster, so
+/// that p90/p99 quantiles are not pinned to a coarse 4-16 MiB bucket edge.
+const MEMORY_SIZE_BUCKETS: &[f64; 16] = &[
+    65536.0,      // 64 KiB
+    262144.0,     // 256 KiB
+    1048576.0,    // 1 MiB
+    2097152.0,    // 2 MiB
+    4194304.0,    // 4 MiB
+    6291456.0,    // 6 MiB
+    8388608.0,    // 8 MiB
+    12582912.0,   // 12 MiB
+    16777216.0,   // 16 MiB
+    25165824.0,   // 24 MiB
+    33554432.0,   // 32 MiB
+    67108864.0,   // 64 MiB
+    134217728.0,  // 128 MiB
+    268435456.0,  // 256 MiB
+    536870912.0,  // 512 MiB
+    1073741824.0, // 1 GiB
 ];
 
 pub mod component {
@@ -508,6 +516,12 @@ pub mod wasm {
             crate::metrics::MEMORY_SIZE_BUCKETS.to_vec()
         )
         .unwrap();
+        static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!(
+            "worker_resident_linear_memory_bytes",
+            "Per-worker cumulative linear memory size (total_linear_memory_size) observed when acquiring a memory permit",
+            crate::metrics::MEMORY_SIZE_BUCKETS.to_vec()
+        )
+        .unwrap();
     }
 
     lazy_static! {
@@ -580,6 +594,10 @@ pub mod wasm {
     pub fn record_allocated_memory(amount: usize) {
         ALLOCATED_MEMORY_BYTES.observe(amount as f64);
     }
+
+    pub fn record_worker_resident_linear_memory(bytes: u64) {
+        WORKER_RESIDENT_LINEAR_MEMORY_BYTES.observe(bytes as f64);
+    }
 }
 
 pub mod oplog {
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 733d9529af..76b7720bf0 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -963,6 +963,12 @@ pub struct MemoryConfig {
     pub system_memory_override: Option<u64>,
     pub worker_memory_ratio: f64,
     pub worker_estimate_coefficient: f64,
+    /// Multiplier applied to a worker's `component_size` when estimating its
+    /// memory permit requirement. The compiled component is loaded into the
+    /// engine once per component (shared across all workers of that component),
+    /// so this term over-accounts per-worker memory for large components.
+    /// Lower this (e.g. to 0.0) to size permits primarily off linear memory.
+    pub component_size_coefficient: f64,
     #[serde(with = "humantime_serde")]
     pub acquire_retry_delay: Duration,
     pub oom_retry_config: RetryConfig,
@@ -1004,6 +1010,11 @@ impl SafeDisplay for MemoryConfig {
             "worker estimate coefficient: {}",
             self.worker_estimate_coefficient
         );
+        let _ = writeln!(
+            &mut result,
+            "component size coefficient: {}",
+            self.component_size_coefficient
+        );
         let _ = writeln!(
             &mut result,
             "acquire retry delay: {:?}",
@@ -1528,6 +1539,7 @@ impl Default for MemoryConfig {
             system_memory_override: None,
             worker_memory_ratio: 0.8,
             worker_estimate_coefficient: 1.1,
+            component_size_coefficient: 2.0,
             acquire_retry_delay: Duration::from_millis(500),
             oom_retry_config: RetryConfig {
                 max_attempts: u32::MAX,
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index 1e6d4fa7cc..a65d6dd867 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -122,6 +122,7 @@ pub struct Worker<Ctx: WorkerCtx> {
     execution_status: Arc<std::sync::RwLock<ExecutionStatus>>,
     update_state_lock: Mutex<()>,
     worker_estimate_coefficient: f64,
+    component_size_coefficient: f64,
 
     // IMPORTANT: Every external operation must acquire the instance lock, even briefly, to confirm the worker isn’t deleting.
     instance: Arc<Mutex<WorkerInstance>>,
@@ -340,6 +341,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             last_known_status: current_status,
             metrics_status,
             worker_estimate_coefficient: deps.config().memory.worker_estimate_coefficient,
+            component_size_coefficient: deps.config().memory.component_size_coefficient,
             oom_retry_config: deps.config().memory.oom_retry_config.clone(),
             snapshot_policy,
             update_state_lock: Mutex::new(()),
@@ -410,6 +412,12 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             WorkerInstance::Unloaded { .. } => {
                 this.mark_as_loading();
                 crate::metrics::workers::inc_worker_waiting_for_memory();
+                crate::metrics::wasm::record_worker_resident_linear_memory(
+                    this.get_latest_worker_metadata()
+                        .await
+                        .last_known_status
+                        .total_linear_memory_size,
+                );
                 *instance_guard = WorkerInstance::WaitingForPermit(WaitingWorker::new(
                     this.clone(),
                     this.memory_requirement().await?,
@@ -795,7 +803,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
 
         let ml = metadata.last_known_status.total_linear_memory_size as f64;
         let sw = metadata.last_known_status.component_size as f64;
-        let c = 2.0;
+        let c = self.component_size_coefficient;
         let x = self.worker_estimate_coefficient;
         Ok((x * (ml + c * sw)) as u64)
     }

From 02e527a7ce7d25306a28f98088eef30baf959168 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:45:01 -0700
Subject: [PATCH 11/60] feat(benchmark): run only throughput-echo test

---
 .../benchmark_suites/cloud-perf.yaml          | 184 +++++++++---------
 1 file changed, 96 insertions(+), 88 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index b91a3d1821..9cb7bded85 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -20,6 +20,10 @@
 # RPC worker pairs and HTTP deployments — the most complex setup. Running them
 # early surfaces infrastructure issues (stuck workers, port-forward drops)
 # before spending time on the simpler benchmarks.
+#
+# NOTE: All benchmarks except throughput-echo are temporarily commented out for
+# the memory-semaphore (component_size_coefficient) knob experiment. Restore
+# them to run the full suite.
 
 name: cloud-perf
 benchmarks:
@@ -34,91 +38,95 @@ benchmarks:
     size: [1, 10, 50, 100]
     length: [1000]
 
-  # size   = number of workers per implementation
-  # length = payload size in bytes sent to large_input
-  - name: throughput-large-input
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 10, 25, 50]
-    length: [100, 10000]
-
-  # size   = number of workers per implementation
-  # length = CPU work length passed to cpu_intensive
-  - name: throughput-cpu-intensive
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 10, 25, 50]
-    length: [100]
-
-  # Cold-start: compilation cache disabled — measures true cold-start latency
-  # with no warm compiled artefact available.
-  # size   = number of unique components created (each in its own env)
-  # length = seconds to wait per component for pre-compilation warm-up
-  - name: cold-start-unknown-small
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [2]
-    disableCompilationCache: true
-
-  - name: cold-start-unknown-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [5]
-    disableCompilationCache: true
-
-  # Cold-start: compilation cache enabled — measures latency once the compiled
-  # artefact is available in the cache.
-  # size   = number of unique components created (each in its own env)
-  # length = seconds to wait per component for pre-compilation warm-up
-  # NOTE: if results here are close to the cache-disabled entries above, the
-  # warm-up wait is too short and compilation hasn't finished — bump length.
-  - name: cold-start-unknown-small
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [2]
-
-  - name: cold-start-unknown-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [5]
-
-  # Invocation latency — hot and cold paths through the Gateway NLB.
-  # Large worker counts to stress the load balancer and connection pool.
-  # size   = number of workers created
-  # length = number of hot invocations per worker after the first cold one
-  - name: latency-small
-    iterations: 3
-    clusterSize: [2]
-    size: [100, 500, 1000, 2000, 5000]
-    length: [2]
-
-  - name: latency-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [100, 500, 1000, 2000]
-    length: [5]
-
-  # Sleep — measures worker suspension and resumption under real network
-  # conditions.
-  # size   = number of workers launched in parallel
-  # length = sleep duration in milliseconds
-  - name: sleep
-    iterations: 3
-    clusterSize: [2]
-    size: [10, 100, 500, 1000]
-    length: [10000]
-
-  # Durability overhead — measures the cost of durable vs ephemeral execution
-  # across four variants (durable-persistent, durable-non-persistent,
-  # ephemeral, durable-persistent-commit).
-  # size   = number of workers per variant
-  # length = loop iteration count passed to oplog_heavy
-  - name: durability-overhead
-    iterations: 3
-    clusterSize: [2]
-    size: [10, 50, 100, 200]
-    length: [5000]
+  # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore
+  # (component_size_coefficient) knob experiment, to get faster A/B results.
+  # Restore the entries below to run the full suite again.
+  #
+  # # size   = number of workers per implementation
+  # # length = payload size in bytes sent to large_input
+  # - name: throughput-large-input
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 10, 25, 50]
+  #   length: [100, 10000]
+  #
+  # # size   = number of workers per implementation
+  # # length = CPU work length passed to cpu_intensive
+  # - name: throughput-cpu-intensive
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 10, 25, 50]
+  #   length: [100]
+  #
+  # # Cold-start: compilation cache disabled — measures true cold-start latency
+  # # with no warm compiled artefact available.
+  # # size   = number of unique components created (each in its own env)
+  # # length = seconds to wait per component for pre-compilation warm-up
+  # - name: cold-start-unknown-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [2]
+  #   disableCompilationCache: true
+  #
+  # - name: cold-start-unknown-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [5]
+  #   disableCompilationCache: true
+  #
+  # # Cold-start: compilation cache enabled — measures latency once the compiled
+  # # artefact is available in the cache.
+  # # size   = number of unique components created (each in its own env)
+  # # length = seconds to wait per component for pre-compilation warm-up
+  # # NOTE: if results here are close to the cache-disabled entries above, the
+  # # warm-up wait is too short and compilation hasn't finished — bump length.
+  # - name: cold-start-unknown-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [2]
+  #
+  # - name: cold-start-unknown-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [5]
+  #
+  # # Invocation latency — hot and cold paths through the Gateway NLB.
+  # # Large worker counts to stress the load balancer and connection pool.
+  # # size   = number of workers created
+  # # length = number of hot invocations per worker after the first cold one
+  # - name: latency-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [100, 500, 1000, 2000, 5000]
+  #   length: [2]
+  #
+  # - name: latency-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [100, 500, 1000, 2000]
+  #   length: [5]
+  #
+  # # Sleep — measures worker suspension and resumption under real network
+  # # conditions.
+  # # size   = number of workers launched in parallel
+  # # length = sleep duration in milliseconds
+  # - name: sleep
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [10, 100, 500, 1000]
+  #   length: [10000]
+  #
+  # # Durability overhead — measures the cost of durable vs ephemeral execution
+  # # across four variants (durable-persistent, durable-non-persistent,
+  # # ephemeral, durable-persistent-commit).
+  # # size   = number of workers per variant
+  # # length = loop iteration count passed to oplog_heavy
+  # - name: durability-overhead
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [10, 50, 100, 200]
+  #   length: [5000]

From faeb65149bdc0552b15f8eaf53d1eb1fb389324d Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:50:44 -0700
Subject: [PATCH 12/60] feat(bench): try 200 apps after tuning

---
 integration-tests/benchmark_suites/cloud-perf.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 9cb7bded85..bcd4faaa6d 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -35,7 +35,7 @@ benchmarks:
   - name: throughput-echo
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 50, 100]
+    size: [1, 10, 50, 100, 200]
     length: [1000]
 
   # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore

From f8dd565f6049538247e52edab148ab69f174127e Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sat, 6 Jun 2026 01:04:56 -0700
Subject: [PATCH 13/60] feat: try 250 again

---
 integration-tests/benchmark_suites/cloud-perf.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index bcd4faaa6d..d4ff6e3b23 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -35,7 +35,7 @@ benchmarks:
   - name: throughput-echo
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 50, 100, 200]
+    size: [1, 10, 50, 100, 250]
     length: [1000]
 
   # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore

From 1bf006314d5210c6d9434209d338be02c865da03 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sat, 6 Jun 2026 21:52:55 -0700
Subject: [PATCH 14/60] feat(benchmark): run all the tests again

---
 .../benchmark_suites/cloud-perf.yaml          | 184 +++++++++---------
 .../src/benchmarks/throughput.rs              |  47 ++++-
 2 files changed, 134 insertions(+), 97 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index d4ff6e3b23..6e258c3c99 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -20,10 +20,6 @@
 # RPC worker pairs and HTTP deployments — the most complex setup. Running them
 # early surfaces infrastructure issues (stuck workers, port-forward drops)
 # before spending time on the simpler benchmarks.
-#
-# NOTE: All benchmarks except throughput-echo are temporarily commented out for
-# the memory-semaphore (component_size_coefficient) knob experiment. Restore
-# them to run the full suite.
 
 name: cloud-perf
 benchmarks:
@@ -38,95 +34,91 @@ benchmarks:
     size: [1, 10, 50, 100, 250]
     length: [1000]
 
-  # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore
-  # (component_size_coefficient) knob experiment, to get faster A/B results.
-  # Restore the entries below to run the full suite again.
-  #
-  # # size   = number of workers per implementation
-  # # length = payload size in bytes sent to large_input
-  # - name: throughput-large-input
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 10, 25, 50]
-  #   length: [100, 10000]
-  #
-  # # size   = number of workers per implementation
-  # # length = CPU work length passed to cpu_intensive
-  # - name: throughput-cpu-intensive
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 10, 25, 50]
-  #   length: [100]
-  #
-  # # Cold-start: compilation cache disabled — measures true cold-start latency
-  # # with no warm compiled artefact available.
-  # # size   = number of unique components created (each in its own env)
-  # # length = seconds to wait per component for pre-compilation warm-up
-  # - name: cold-start-unknown-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [2]
-  #   disableCompilationCache: true
-  #
-  # - name: cold-start-unknown-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [5]
-  #   disableCompilationCache: true
-  #
-  # # Cold-start: compilation cache enabled — measures latency once the compiled
-  # # artefact is available in the cache.
-  # # size   = number of unique components created (each in its own env)
-  # # length = seconds to wait per component for pre-compilation warm-up
-  # # NOTE: if results here are close to the cache-disabled entries above, the
-  # # warm-up wait is too short and compilation hasn't finished — bump length.
-  # - name: cold-start-unknown-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [2]
-  #
-  # - name: cold-start-unknown-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [5]
-  #
-  # # Invocation latency — hot and cold paths through the Gateway NLB.
-  # # Large worker counts to stress the load balancer and connection pool.
-  # # size   = number of workers created
-  # # length = number of hot invocations per worker after the first cold one
-  # - name: latency-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [100, 500, 1000, 2000, 5000]
-  #   length: [2]
-  #
-  # - name: latency-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [100, 500, 1000, 2000]
-  #   length: [5]
-  #
-  # # Sleep — measures worker suspension and resumption under real network
-  # # conditions.
-  # # size   = number of workers launched in parallel
-  # # length = sleep duration in milliseconds
-  # - name: sleep
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [10, 100, 500, 1000]
-  #   length: [10000]
-  #
-  # # Durability overhead — measures the cost of durable vs ephemeral execution
-  # # across four variants (durable-persistent, durable-non-persistent,
-  # # ephemeral, durable-persistent-commit).
-  # # size   = number of workers per variant
-  # # length = loop iteration count passed to oplog_heavy
-  # - name: durability-overhead
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [10, 50, 100, 200]
-  #   length: [5000]
+  # size   = number of workers per implementation
+  # length = payload size in bytes sent to large_input
+  - name: throughput-large-input
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 25, 50]
+    length: [100, 10000]
+
+  # size   = number of workers per implementation
+  # length = CPU work length passed to cpu_intensive
+  - name: throughput-cpu-intensive
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 10, 25, 50]
+    length: [100]
+
+  # Cold-start: compilation cache disabled — measures true cold-start latency
+  # with no warm compiled artefact available.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [2]
+    disableCompilationCache: true
+
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [5]
+    disableCompilationCache: true
+
+  # Cold-start: compilation cache enabled — measures latency once the compiled
+  # artefact is available in the cache.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  # NOTE: if results here are close to the cache-disabled entries above, the
+  # warm-up wait is too short and compilation hasn't finished — bump length.
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [2]
+
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [5]
+
+  # Invocation latency — hot and cold paths through the Gateway NLB.
+  # Large worker counts to stress the load balancer and connection pool.
+  # size   = number of workers created
+  # length = number of hot invocations per worker after the first cold one
+  - name: latency-small
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000, 5000]
+    length: [2]
+
+  - name: latency-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000]
+    length: [5]
+
+  # Sleep — measures worker suspension and resumption under real network
+  # conditions.
+  # size   = number of workers launched in parallel
+  # length = sleep duration in milliseconds
+  - name: sleep
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 100, 500, 1000]
+    length: [10000]
+
+  # Durability overhead — measures the cost of durable vs ephemeral execution
+  # across four variants (durable-persistent, durable-non-persistent,
+  # ephemeral, durable-persistent-commit).
+  # size   = number of workers per variant
+  # length = loop iteration count passed to oplog_heavy
+  - name: durability-overhead
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 50, 100, 200]
+    length: [5000]
diff --git a/integration-tests/src/benchmarks/throughput.rs b/integration-tests/src/benchmarks/throughput.rs
index 5515090847..f3552e0eee 100644
--- a/integration-tests/src/benchmarks/throughput.rs
+++ b/integration-tests/src/benchmarks/throughput.rs
@@ -29,7 +29,7 @@ use golem_common::model::http_api_deployment::{
 };
 use golem_common::model::{AgentId, RoutingTable};
 use golem_common::{agent_id, data_value};
-use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
+use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
 use golem_test_framework::config::dsl_impl::TestUserContext;
 use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies};
@@ -38,6 +38,7 @@ use indoc::indoc;
 use reqwest::{Body, Method, Request, Url};
 use serde_json::json;
 use std::collections::BTreeMap;
+use std::time::Instant;
 use tracing::{Instrument, Level, info};
 
 pub struct ThroughputEcho {
@@ -460,6 +461,31 @@ fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> V
         .collect()
 }
 
+/// Records aggregate throughput (invocations per second) for a measurement
+/// block as a `count` result under the key `{prefix}throughput-ops-per-sec`.
+///
+/// `total_calls` is the number of invocations issued across all targets in the
+/// block and `elapsed` the wall-clock duration of the concurrently executed
+/// block, so the value is an aggregate rate (rounded to the nearest integer),
+/// not a per-call latency. Nothing is recorded if either input is zero.
+fn record_throughput(
+    recorder: &BenchmarkRecorder,
+    prefix: &str,
+    total_calls: usize,
+    elapsed: std::time::Duration,
+) {
+    let secs = elapsed.as_secs_f64();
+    if secs <= 0.0 || total_calls == 0 {
+        return; // avoids dividing by zero and recording a meaningless rate
+    }
+    let ops_per_sec = (total_calls as f64 / secs).round() as u64;
+    info!("{prefix}throughput: {total_calls} calls in {secs:.3}s = {ops_per_sec} ops/sec");
+    recorder.count(
+        &ResultKey::primary(format!("{prefix}throughput-ops-per-sec")),
+        ops_per_sec,
+    );
+}
+
 impl ThroughputBenchmark {
     pub async fn new(
         rust_method_name: &str,
@@ -796,7 +822,10 @@ impl ThroughputBenchmark {
                 })
                 .collect::<Vec<_>>();
 
+            let started = Instant::now();
             let results = result_futures.join().await;
+            let elapsed = started.elapsed();
+            record_throughput(recorder, prefix, targets.len() * call_count, elapsed);
             for (idx, (results, target)) in results.iter().zip(targets).enumerate() {
                 let prefix = target.prefix(prefix, routing_table);
                 for result in results {
@@ -903,7 +932,15 @@ impl ThroughputBenchmark {
                 })
                 .collect::<Vec<_>>();
 
+            let started = Instant::now();
             let results = result_futures.join().await;
+            let elapsed = started.elapsed();
+            record_throughput(
+                &recorder,
+                "rust-agent-http-",
+                iteration.rust_agent_ids_for_http.len() * self.call_count,
+                elapsed,
+            );
             for (idx, results) in results.iter().enumerate() {
                 for result in results {
                     result.record(&recorder, "rust-agent-http-", idx.to_string().as_str());
@@ -936,7 +973,15 @@ impl ThroughputBenchmark {
                 })
                 .collect::<Vec<_>>();
 
+            let started = Instant::now();
             let results = result_futures.join().await;
+            let elapsed = started.elapsed();
+            record_throughput(
+                &recorder,
+                "ts-agent-http-",
+                iteration.ts_agent_ids_for_http.len() * self.call_count,
+                elapsed,
+            );
             for (idx, results) in results.iter().enumerate() {
                 for result in results {
                     result.record(&recorder, "ts-agent-http-", idx.to_string().as_str());

From 2e53af6bb401a26f17caf042cde010bc381be32e Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sat, 6 Jun 2026 23:34:22 -0700
Subject: [PATCH 15/60] fix: metric description

---
 golem-worker-executor/src/metrics.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index c9d610e79d..de6d673632 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -518,7 +518,7 @@ pub mod wasm {
         .unwrap();
         static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!(
             "worker_resident_linear_memory_bytes",
-            "Per-worker cumulative linear memory size (total_linear_memory_size) observed when acquiring a memory permit",
+            "Per-worker cumulative linear-memory ceiling (total_linear_memory_size = sum of memory.grow deltas) sampled at permit acquire. This is the semaphore charge basis (x*ml), an upper bound on resident RSS, NOT measured resident memory (grown pages are largely demand-paged); compare to container_memory_working_set_bytes for the gap",
             crate::metrics::MEMORY_SIZE_BUCKETS.to_vec()
         )
         .unwrap();

From 32ef9e59a1eb5edfd0f96c2d038c32a8c7c20c8b Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 7 Jun 2026 00:23:48 -0700
Subject: [PATCH 16/60] feat: proper load for our cluster

---
 .../benchmark_suites/cloud-perf.yaml          | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 6e258c3c99..088e5c43f7 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -36,10 +36,13 @@ benchmarks:
 
   # size   = number of workers per implementation
   # length = payload size in bytes sent to large_input
+  # NOTE: large payloads grow worker linear memory, so this is the throughput
+  # benchmark most relevant to the memory-admission investigation — sized to
+  # match throughput-echo so it exercises real density.
   - name: throughput-large-input
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 25, 50]
+    size: [1, 10, 50, 100, 250]
     length: [100, 10000]
 
   # size   = number of workers per implementation
@@ -47,13 +50,18 @@ benchmarks:
   - name: throughput-cpu-intensive
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 25, 50]
+    size: [1, 10, 50, 100, 250]
     length: [100]
 
   # Cold-start: compilation cache disabled — measures true cold-start latency
   # with no warm compiled artefact available.
   # size   = number of unique components created (each in its own env)
   # length = seconds to wait per component for pre-compilation warm-up
+  # NOTE: each unit here is a UNIQUE component, so size scales compilations
+  # (compilation-service + S3 cache load), not worker density. Kept at max 20:
+  # cold-start latency is already characterized there, and the cache-enabled
+  # warmup sleeps length*size seconds per run, so larger sizes mostly add idle
+  # wait rather than signal.
   - name: cold-start-unknown-small
     iterations: 3
     clusterSize: [2]
@@ -103,22 +111,25 @@ benchmarks:
     length: [5]
 
   # Sleep — measures worker suspension and resumption under real network
-  # conditions.
+  # conditions. High residency: all `size` workers held in memory sleeping at
+  # once, so this also probes how many resident workers fit (memory-admission
+  # relevant) — pushed past the ~2000 echo proved out.
   # size   = number of workers launched in parallel
   # length = sleep duration in milliseconds
   - name: sleep
     iterations: 3
     clusterSize: [2]
-    size: [10, 100, 500, 1000]
+    size: [10, 100, 500, 1000, 2000]
     length: [10000]
 
   # Durability overhead — measures the cost of durable vs ephemeral execution
   # across four variants (durable-persistent, durable-non-persistent,
-  # ephemeral, durable-persistent-commit).
+  # ephemeral, durable-persistent-commit). size workers concurrent per phase;
+  # sized up to put real load on the oplog/persistence/storage path.
   # size   = number of workers per variant
   # length = loop iteration count passed to oplog_heavy
   - name: durability-overhead
     iterations: 3
     clusterSize: [2]
-    size: [10, 50, 100, 200]
+    size: [10, 50, 100, 250]
     length: [5000]

From bc117799dad080b28bfbe55467f42a3233e3d257 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 7 Jun 2026 21:00:41 -0700
Subject: [PATCH 17/60] feat(benchmark): run only throughput benchmarks

---
 .../benchmark_suites/cloud-perf.yaml          | 152 +++++++++---------
 1 file changed, 76 insertions(+), 76 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 088e5c43f7..f7aab7e9f4 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -31,7 +31,7 @@ benchmarks:
   - name: throughput-echo
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 50, 100, 250]
+    size: [1, 50, 100, 250]
     length: [1000]
 
   # size   = number of workers per implementation
@@ -42,7 +42,7 @@ benchmarks:
   - name: throughput-large-input
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 50, 100, 250]
+    size: [1, 50, 100, 250]
     length: [100, 10000]
 
   # size   = number of workers per implementation
@@ -50,86 +50,86 @@ benchmarks:
   - name: throughput-cpu-intensive
     iterations: 3
     clusterSize: [2]
-    size: [1, 10, 50, 100, 250]
+    size: [1, 50, 100, 250]
     length: [100]
 
-  # Cold-start: compilation cache disabled — measures true cold-start latency
-  # with no warm compiled artefact available.
-  # size   = number of unique components created (each in its own env)
-  # length = seconds to wait per component for pre-compilation warm-up
-  # NOTE: each unit here is a UNIQUE component, so size scales compilations
-  # (compilation-service + S3 cache load), not worker density. Kept at max 20:
-  # cold-start latency is already characterized there, and the cache-enabled
-  # warmup sleeps length*size seconds per run, so larger sizes mostly add idle
-  # wait rather than signal.
-  - name: cold-start-unknown-small
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [2]
-    disableCompilationCache: true
+  # # Cold-start: compilation cache disabled — measures true cold-start latency
+  # # with no warm compiled artefact available.
+  # # size   = number of unique components created (each in its own env)
+  # # length = seconds to wait per component for pre-compilation warm-up
+  # # NOTE: each unit here is a UNIQUE component, so size scales compilations
+  # # (compilation-service + S3 cache load), not worker density. Kept at max 20:
+  # # cold-start latency is already characterized there, and the cache-enabled
+  # # warmup sleeps length*size seconds per run, so larger sizes mostly add idle
+  # # wait rather than signal.
+  # - name: cold-start-unknown-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [2]
+  #   disableCompilationCache: true
 
-  - name: cold-start-unknown-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [5]
-    disableCompilationCache: true
+  # - name: cold-start-unknown-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [5]
+  #   disableCompilationCache: true
 
-  # Cold-start: compilation cache enabled — measures latency once the compiled
-  # artefact is available in the cache.
-  # size   = number of unique components created (each in its own env)
-  # length = seconds to wait per component for pre-compilation warm-up
-  # NOTE: if results here are close to the cache-disabled entries above, the
-  # warm-up wait is too short and compilation hasn't finished — bump length.
-  - name: cold-start-unknown-small
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [2]
+  # # Cold-start: compilation cache enabled — measures latency once the compiled
+  # # artefact is available in the cache.
+  # # size   = number of unique components created (each in its own env)
+  # # length = seconds to wait per component for pre-compilation warm-up
+  # # NOTE: if results here are close to the cache-disabled entries above, the
+  # # warm-up wait is too short and compilation hasn't finished — bump length.
+  # - name: cold-start-unknown-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [2]
 
-  - name: cold-start-unknown-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 20]
-    length: [5]
+  # - name: cold-start-unknown-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 20]
+  #   length: [5]
 
-  # Invocation latency — hot and cold paths through the Gateway NLB.
-  # Large worker counts to stress the load balancer and connection pool.
-  # size   = number of workers created
-  # length = number of hot invocations per worker after the first cold one
-  - name: latency-small
-    iterations: 3
-    clusterSize: [2]
-    size: [100, 500, 1000, 2000, 5000]
-    length: [2]
+  # # Invocation latency — hot and cold paths through the Gateway NLB.
+  # # Large worker counts to stress the load balancer and connection pool.
+  # # size   = number of workers created
+  # # length = number of hot invocations per worker after the first cold one
+  # - name: latency-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [100, 500, 1000, 2000, 5000]
+  #   length: [2]
 
-  - name: latency-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [100, 500, 1000, 2000]
-    length: [5]
+  # - name: latency-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [100, 500, 1000, 2000]
+  #   length: [5]
 
-  # Sleep — measures worker suspension and resumption under real network
-  # conditions. High residency: all `size` workers held in memory sleeping at
-  # once, so this also probes how many resident workers fit (memory-admission
-  # relevant) — pushed past the ~2000 echo proved out.
-  # size   = number of workers launched in parallel
-  # length = sleep duration in milliseconds
-  - name: sleep
-    iterations: 3
-    clusterSize: [2]
-    size: [10, 100, 500, 1000, 2000]
-    length: [10000]
+  # # Sleep — measures worker suspension and resumption under real network
+  # # conditions. High residency: all `size` workers held in memory sleeping at
+  # # once, so this also probes how many resident workers fit (memory-admission
+  # # relevant) — pushed past the ~2000 echo proved out.
+  # # size   = number of workers launched in parallel
+  # # length = sleep duration in milliseconds
+  # - name: sleep
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [10, 100, 500, 1000, 2000]
+  #   length: [10000]
 
-  # Durability overhead — measures the cost of durable vs ephemeral execution
-  # across four variants (durable-persistent, durable-non-persistent,
-  # ephemeral, durable-persistent-commit). size workers concurrent per phase;
-  # sized up to put real load on the oplog/persistence/storage path.
-  # size   = number of workers per variant
-  # length = loop iteration count passed to oplog_heavy
-  - name: durability-overhead
-    iterations: 3
-    clusterSize: [2]
-    size: [10, 50, 100, 250]
-    length: [5000]
+  # # Durability overhead — measures the cost of durable vs ephemeral execution
+  # # across four variants (durable-persistent, durable-non-persistent,
+  # # ephemeral, durable-persistent-commit). size workers concurrent per phase;
+  # # sized up to put real load on the oplog/persistence/storage path.
+  # # size   = number of workers per variant
+  # # length = loop iteration count passed to oplog_heavy
+  # - name: durability-overhead
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [10, 50, 100, 250]
+  #   length: [5000]

From 534762695f6ba8a8288f4c6c10c6acc237de69bd Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 7 Jun 2026 21:04:21 -0700
Subject: [PATCH 18/60] feat: enable all tests again

---
 .../benchmark_suites/cloud-perf.yaml          | 146 +++++++++---------
 1 file changed, 73 insertions(+), 73 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index f7aab7e9f4..81ff01bfc5 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -53,83 +53,83 @@ benchmarks:
     size: [1, 50, 100, 250]
     length: [100]
 
-  # # Cold-start: compilation cache disabled — measures true cold-start latency
-  # # with no warm compiled artefact available.
-  # # size   = number of unique components created (each in its own env)
-  # # length = seconds to wait per component for pre-compilation warm-up
-  # # NOTE: each unit here is a UNIQUE component, so size scales compilations
-  # # (compilation-service + S3 cache load), not worker density. Kept at max 20:
-  # # cold-start latency is already characterized there, and the cache-enabled
-  # # warmup sleeps length*size seconds per run, so larger sizes mostly add idle
-  # # wait rather than signal.
-  # - name: cold-start-unknown-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [2]
-  #   disableCompilationCache: true
+  # Cold-start: compilation cache disabled — measures true cold-start latency
+  # with no warm compiled artefact available.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  # NOTE: each unit here is a UNIQUE component, so size scales compilations
+  # (compilation-service + S3 cache load), not worker density. Kept at max 20:
+  # cold-start latency is already characterized there, and the cache-enabled
+  # warmup sleeps length*size seconds per run, so larger sizes mostly add idle
+  # wait rather than signal.
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [2]
+    disableCompilationCache: true
 
-  # - name: cold-start-unknown-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [5]
-  #   disableCompilationCache: true
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [5]
+    disableCompilationCache: true
 
-  # # Cold-start: compilation cache enabled — measures latency once the compiled
-  # # artefact is available in the cache.
-  # # size   = number of unique components created (each in its own env)
-  # # length = seconds to wait per component for pre-compilation warm-up
-  # # NOTE: if results here are close to the cache-disabled entries above, the
-  # # warm-up wait is too short and compilation hasn't finished — bump length.
-  # - name: cold-start-unknown-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [2]
+  # Cold-start: compilation cache enabled — measures latency once the compiled
+  # artefact is available in the cache.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  # NOTE: if results here are close to the cache-disabled entries above, the
+  # warm-up wait is too short and compilation hasn't finished — bump length.
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [2]
 
-  # - name: cold-start-unknown-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 20]
-  #   length: [5]
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 10, 20]
+    length: [5]
 
-  # # Invocation latency — hot and cold paths through the Gateway NLB.
-  # # Large worker counts to stress the load balancer and connection pool.
-  # # size   = number of workers created
-  # # length = number of hot invocations per worker after the first cold one
-  # - name: latency-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [100, 500, 1000, 2000, 5000]
-  #   length: [2]
+  # Invocation latency — hot and cold paths through the Gateway NLB.
+  # Large worker counts to stress the load balancer and connection pool.
+  # size   = number of workers created
+  # length = number of hot invocations per worker after the first cold one
+  - name: latency-small
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000, 5000]
+    length: [2]
 
-  # - name: latency-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [100, 500, 1000, 2000]
-  #   length: [5]
+  - name: latency-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000]
+    length: [5]
 
-  # # Sleep — measures worker suspension and resumption under real network
-  # # conditions. High residency: all `size` workers held in memory sleeping at
-  # # once, so this also probes how many resident workers fit (memory-admission
-  # # relevant) — pushed past the ~2000 echo proved out.
-  # # size   = number of workers launched in parallel
-  # # length = sleep duration in milliseconds
-  # - name: sleep
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [10, 100, 500, 1000, 2000]
-  #   length: [10000]
+  # Sleep — measures worker suspension and resumption under real network
+  # conditions. High residency: all `size` workers held in memory sleeping at
+  # once, so this also probes how many resident workers fit (memory-admission
+  # relevant) — pushed past the ~2000 echo proved out.
+  # size   = number of workers launched in parallel
+  # length = sleep duration in milliseconds
+  - name: sleep
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 100, 500, 1000, 2000]
+    length: [10000]
 
-  # # Durability overhead — measures the cost of durable vs ephemeral execution
-  # # across four variants (durable-persistent, durable-non-persistent,
-  # # ephemeral, durable-persistent-commit). size workers concurrent per phase;
-  # # sized up to put real load on the oplog/persistence/storage path.
-  # # size   = number of workers per variant
-  # # length = loop iteration count passed to oplog_heavy
-  # - name: durability-overhead
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [10, 50, 100, 250]
-  #   length: [5000]
+  # Durability overhead — measures the cost of durable vs ephemeral execution
+  # across four variants (durable-persistent, durable-non-persistent,
+  # ephemeral, durable-persistent-commit). size workers concurrent per phase;
+  # sized up to put real load on the oplog/persistence/storage path.
+  # size   = number of workers per variant
+  # length = loop iteration count passed to oplog_heavy
+  - name: durability-overhead
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 50, 100, 250]
+    length: [5000]

From 9e582a2603cbbe1b09697a61f4d0e29ed08087a0 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 7 Jun 2026 21:13:29 -0700
Subject: [PATCH 19/60] feat(benchmark): increase max number of concurrent
 compilations

---
 integration-tests/benchmark_suites/cloud-perf.yaml | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 81ff01bfc5..d508c5b5fc 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -57,22 +57,17 @@ benchmarks:
   # with no warm compiled artefact available.
   # size   = number of unique components created (each in its own env)
   # length = seconds to wait per component for pre-compilation warm-up
-  # NOTE: each unit here is a UNIQUE component, so size scales compilations
-  # (compilation-service + S3 cache load), not worker density. Kept at max 20:
-  # cold-start latency is already characterized there, and the cache-enabled
-  # warmup sleeps length*size seconds per run, so larger sizes mostly add idle
-  # wait rather than signal.
   - name: cold-start-unknown-small
     iterations: 3
     clusterSize: [2]
-    size: [1, 5, 10, 20]
+    size: [1, 5, 10, 25, 50]
     length: [2]
     disableCompilationCache: true
 
   - name: cold-start-unknown-medium
     iterations: 3
     clusterSize: [2]
-    size: [1, 5, 10, 20]
+    size: [1, 5, 10, 25, 50]
     length: [5]
     disableCompilationCache: true
 
@@ -85,13 +80,13 @@ benchmarks:
   - name: cold-start-unknown-small
     iterations: 3
     clusterSize: [2]
-    size: [1, 5, 10, 20]
+    size: [1, 5, 10, 25, 50]
     length: [2]
 
   - name: cold-start-unknown-medium
     iterations: 3
     clusterSize: [2]
-    size: [1, 5, 10, 20]
+    size: [1, 5, 10, 25, 50]
     length: [5]
 
   # Invocation latency — hot and cold paths through the Gateway NLB.

From e7b44bf3e5e2d83a3df3418d72c38219654aa40b Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Mon, 8 Jun 2026 16:53:27 -0700
Subject: [PATCH 20/60] feat(worker-executor): add measured-headroom memory
 admission gate

---
 .../config/debug-worker-executor.sample.env   |   2 +
 .../config/debug-worker-executor.toml         |   2 +
 .../config/worker-executor.sample.env         |   3 +
 .../config/worker-executor.toml               |   3 +
 .../services/active_workers/admission/mod.rs  | 160 ++++++
 .../active_workers/admission/tests.rs         | 508 ++++++++++++++++++
 .../services/active_workers/memory_probe.rs   | 180 +++++++
 .../src/services/active_workers/mod.rs        | 190 ++++---
 .../src/services/golem_config.rs              |  21 +
 9 files changed, 1006 insertions(+), 63 deletions(-)
 create mode 100644 golem-worker-executor/src/services/active_workers/admission/mod.rs
 create mode 100644 golem-worker-executor/src/services/active_workers/admission/tests.rs
 create mode 100644 golem-worker-executor/src/services/active_workers/memory_probe.rs

diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env
index 66afafc82a..d717cf777a 100644
--- a/golem-debugging-service/config/debug-worker-executor.sample.env
+++ b/golem-debugging-service/config/debug-worker-executor.sample.env
@@ -55,6 +55,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
@@ -229,6 +230,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml
index 7191c36996..7d23b08cd5 100644
--- a/golem-debugging-service/config/debug-worker-executor.toml
+++ b/golem-debugging-service/config/debug-worker-executor.toml
@@ -96,6 +96,7 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
+admission_reserve_bytes = 268435456
 component_size_coefficient = 2.0
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
@@ -365,6 +366,7 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env
index 2ef7701cc5..dc33d7b3c1 100644
--- a/golem-worker-executor/config/worker-executor.sample.env
+++ b/golem-worker-executor/config/worker-executor.sample.env
@@ -72,6 +72,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
@@ -292,6 +293,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
@@ -482,6 +484,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml
index b1bab39be9..265ec5f904 100644
--- a/golem-worker-executor/config/worker-executor.toml
+++ b/golem-worker-executor/config/worker-executor.toml
@@ -125,6 +125,7 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
+admission_reserve_bytes = 268435456
 component_size_coefficient = 2.0
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
@@ -457,6 +458,7 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
@@ -759,6 +761,7 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs
new file mode 100644
index 0000000000..702dc003e7
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs
@@ -0,0 +1,160 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Measured-headroom admission decision.
+//!
+//! Gates worker admission on the executor environment's *real* memory headroom
+//! read from the [`MemoryProbe`], rather than on the estimate-based semaphore in
+//! [`super::ActiveWorkers`]. The two work together: the semaphore is a cheap,
+//! high-frequency pre-filter over reserved-but-not-yet-resident intent; this
+//! controller is the authoritative check against measured resident usage. When
+//! headroom is short it evicts already-resident idle-then-warm work; if it still
+//! cannot make room it rejects rather than over-committing.
+//!
+//! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`]
+//! trait so its decision logic can be exercised in isolation with synthetic
+//! probes and candidate sets.
+
+use super::memory_probe::MemoryProbe;
+use async_trait::async_trait;
+
+/// Why an eviction candidate is worth evicting, in priority order. Lower
+/// variants are evicted first.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub enum EvictionPriority {
+    /// Resident in memory, not executing, no durable pending work. Cheapest to
+    /// evict — losing it costs at most a re-load on next use.
+    Idle,
+    /// Resident in memory, not executing, but has durable pending work. Evicted
+    /// only after all idle candidates are exhausted.
+    Warm,
+}
+
+/// A source of evictable, already-resident memory the controller can reclaim to
+/// restore headroom. Abstracts over the live worker set so the decision logic
+/// is testable without `Worker`/wasmtime.
+#[async_trait]
+pub trait EvictionSource: Send + Sync {
+    /// Evict at the given priority tier, attempting to free at least
+    /// `needed_bytes`. Returns the number of bytes actually reclaimed (which may
+    /// be less if the tier is exhausted, or more if a single victim was larger
+    /// than needed). Must not evict from a higher (more expensive) tier than the
+    /// one requested.
+    async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64;
+}
+
+/// The outcome of an admission attempt.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AdmissionDecision {
+    /// There is enough real headroom (possibly after eviction) to admit the
+    /// request without risking the limit.
+    Admit,
+    /// Not enough headroom could be freed; the request must back off rather
+    /// than over-commit.
+    Reject,
+}
+
+/// Configuration for the headroom-based admission decision.
+///
+/// Two knobs with distinct jobs:
+///
+/// * `usable_ratio` — fraction of the measured limit usable for WASM admission.
+///   The remainder is left for the host (the executor process, allocator
+///   arenas, runtime buffers). Mirrors `worker_memory_ratio`, but applied to the
+///   measured limit rather than the configured total.
+///
+/// * `reserve_bytes` — margin kept free below the carve-out ceiling to absorb
+///   the window in which concurrent admissions are observed before becoming
+///   resident. Its sufficiency under concurrency is asserted by the property
+///   test in `tests.rs`.
+#[derive(Debug, Clone, Copy)]
+pub struct AdmissionPolicy {
+    /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission.
+    pub usable_ratio: f64,
+    /// Dynamic safety margin kept free below the carve-out ceiling.
+    pub reserve_bytes: u64,
+}
+
+/// Decides admission against measured headroom, evicting resident idle/warm
+/// work as needed. Holds only its policy and probe; live state is read fresh
+/// from the probe and the eviction source on each call (never cached).
+pub struct AdmissionController {
+    probe: Box<dyn MemoryProbe>,
+    policy: AdmissionPolicy,
+}
+
+impl AdmissionController {
+    /// Creates a controller that samples `probe` and applies `policy` on every call.
+    pub fn new(probe: Box<dyn MemoryProbe>, policy: AdmissionPolicy) -> Self {
+        Self { probe, policy }
+    }
+
+    /// Bytes available for new admissions: the carve-out ceiling
+    /// (`usable_ratio × limit`) minus current usage minus the reserve.
+    /// Saturating throughout — never underflows when already over a ceiling.
+    fn admissible_headroom(&self) -> u64 {
+        let snapshot = self.probe.snapshot();
+        let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64;
+        ceiling
+            .saturating_sub(snapshot.current_bytes)
+            .saturating_sub(self.policy.reserve_bytes)
+    }
+
+    /// Decide whether `request_bytes` can be admitted, evicting from `source` if
+    /// the current headroom is insufficient.
+    ///
+    /// Eviction is attempted idle-first, then warm, and only up to the shortfall
+    /// (never evicts when headroom already suffices). After eviction the
+    /// headroom is re-measured against ground truth; the request is admitted only
+    /// if the real headroom now covers it, otherwise it is rejected.
+    pub async fn try_admit(
+        &self,
+        request_bytes: u64,
+        source: &dyn EvictionSource,
+    ) -> AdmissionDecision {
+        // Fast path: enough real headroom already, admit without evicting.
+        let headroom = self.admissible_headroom();
+        if headroom >= request_bytes {
+            return AdmissionDecision::Admit;
+        }
+
+        // Reclaim idle-then-warm work, sized from the sample that failed the fast path.
+        let mut remaining = request_bytes.saturating_sub(headroom);
+        for priority in [EvictionPriority::Idle, EvictionPriority::Warm] {
+            if remaining == 0 {
+                break;
+            }
+            let freed = source.evict_at_most(priority, remaining).await;
+            remaining = remaining.saturating_sub(freed);
+        }
+
+        // Re-measure against ground truth rather than trusting the freed tally:
+        // the probe is the authority, and other activity may have moved usage
+        // in either direction while we were evicting.
+        if self.admissible_headroom() >= request_bytes {
+            AdmissionDecision::Admit
+        } else {
+            AdmissionDecision::Reject
+        }
+    }
+
+    /// The current admissible headroom. Exposed for metrics and for callers that
+    /// want to make their own pre-check.
+    pub fn headroom_bytes(&self) -> u64 {
+        self.admissible_headroom()
+    }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs
new file mode 100644
index 0000000000..bd9b51aabb
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs
@@ -0,0 +1,508 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Property-based and example tests for the measured-headroom admission valve.
+//!
+//! These tests model an executor environment as a shared cell holding a hard
+//! `limit`, the current resident `usage`, and the set of resident evictable
+//! work (each item carrying a size and an eviction priority). A [`FakeProbe`]
+//! reports `usage`/`limit` from the cell; a [`FakeEvictionSource`] reclaims
+//! idle-then-warm items and decrements `usage`. Admitting a request adds its
+//! size to `usage` as a new resident, non-evictable item (it is actively being
+//! created).
+//!
+//! The model lets `proptest` drive thousands of random admit sequences — with
+//! random request sizes, pre-resident work, and limits — and assert the
+//! invariants that *define* a correct safety valve:
+//!
+//! 1. Safety: usage never exceeds the limit (the environment never OOMs).
+//! 2. No spurious eviction: when headroom is ample, nothing is evicted.
+//! 3. Eviction ordering: idle work is reclaimed before warm work.
+//! 4. Clean rejection: when the request genuinely cannot fit, the decision is
+//!    `Reject` and no over-commit happens.
+
+use super::*;
+use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot};
+use proptest::prelude::*;
+use std::sync::{Arc, Mutex};
+use test_r::test;
+
+test_r::enable!();
+
+/// One unit of resident, evictable work in the model.
+#[derive(Debug, Clone, Copy)]
+struct Resident {
+    size: u64,
+    priority: EvictionPriority,
+}
+
+/// Shared model of the executor environment's memory.
+#[derive(Debug, Default)]
+struct EnvState {
+    limit: u64,
+    /// Resident bytes attributed to admitted, currently-active requests that
+    /// are not yet evictable (they are mid-admission).
+    pinned_usage: u64,
+    /// Resident, evictable work — what the controller may reclaim.
+    residents: Vec<Resident>,
+    /// Count of evictions performed, for the no-spurious-eviction property.
+    evictions: usize,
+    /// The priorities evicted, in order, for the ordering property.
+    eviction_order: Vec<EvictionPriority>,
+}
+
+impl EnvState {
+    fn usage(&self) -> u64 {
+        self.pinned_usage + self.residents.iter().map(|r| r.size).sum::<u64>()
+    }
+}
+
+#[derive(Debug, Clone)]
+struct FakeProbe {
+    state: Arc<Mutex<EnvState>>,
+}
+
+impl MemoryProbe for FakeProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        let state = self.state.lock().unwrap();
+        MemorySnapshot {
+            limit_bytes: state.limit,
+            current_bytes: state.usage(),
+        }
+    }
+}
+
+struct FakeEvictionSource {
+    state: Arc<Mutex<EnvState>>,
+}
+
+#[async_trait::async_trait]
+impl EvictionSource for FakeEvictionSource {
+    async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 {
+        let mut state = self.state.lock().unwrap();
+        let mut freed = 0u64;
+        // Evict only at the requested tier, oldest-first (model: vec order),
+        // until we have freed at least `needed_bytes` or the tier is empty.
+        let mut i = 0;
+        while freed < needed_bytes && i < state.residents.len() {
+            if state.residents[i].priority == priority {
+                let victim = state.residents.remove(i);
+                freed += victim.size;
+                state.evictions += 1;
+                state.eviction_order.push(priority);
+            } else {
+                i += 1;
+            }
+        }
+        freed
+    }
+}
+
+fn controller(state: Arc<Mutex<EnvState>>, reserve_bytes: u64) -> AdmissionController {
+    controller_with_ratio(state, 1.0, reserve_bytes)
+}
+
+fn controller_with_ratio(
+    state: Arc<Mutex<EnvState>>,
+    usable_ratio: f64,
+    reserve_bytes: u64,
+) -> AdmissionController {
+    AdmissionController::new(
+        Box::new(FakeProbe {
+            state: state.clone(),
+        }),
+        AdmissionPolicy {
+            usable_ratio,
+            reserve_bytes,
+        },
+    )
+}
+
+/// Apply one admission attempt against the model, mutating `usage` on admit.
+async fn apply_admit(
+    controller: &AdmissionController,
+    source: &FakeEvictionSource,
+    state: &Arc<Mutex<EnvState>>,
+    request: u64,
+) -> AdmissionDecision {
+    let decision = controller.try_admit(request, source).await;
+    if decision == AdmissionDecision::Admit {
+        state.lock().unwrap().pinned_usage += request;
+    }
+    decision
+}
+
+// ── Single-case unit tests ───────────────────────────────────────────────────
+
+#[test]
+async fn admits_when_headroom_is_ample_without_evicting() {
+    let state = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 0,
+        residents: vec![Resident {
+            size: 100,
+            priority: EvictionPriority::Idle,
+        }],
+        ..Default::default()
+    }));
+    let ctrl = controller(state.clone(), 0);
+    let source = FakeEvictionSource {
+        state: state.clone(),
+    };
+
+    let decision = apply_admit(&ctrl, &source, &state, 200).await;
+    assert_eq!(decision, AdmissionDecision::Admit);
+    // Nothing should have been evicted — there was plenty of headroom.
+    assert_eq!(state.lock().unwrap().evictions, 0);
+}
+
+#[test]
+async fn evicts_idle_before_warm() {
+    let state = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 0,
+        residents: vec![
+            Resident {
+                size: 400,
+                priority: EvictionPriority::Warm,
+            },
+            Resident {
+                size: 400,
+                priority: EvictionPriority::Idle,
+            },
+        ],
+        ..Default::default()
+    }));
+    // usage = 800, limit = 1000, headroom = 200. Request 300 → shortfall 100.
+    // One idle (400) covers it; warm must remain untouched.
+    let ctrl = controller(state.clone(), 0);
+    let source = FakeEvictionSource {
+        state: state.clone(),
+    };
+
+    let decision = apply_admit(&ctrl, &source, &state, 300).await;
+    assert_eq!(decision, AdmissionDecision::Admit);
+
+    let s = state.lock().unwrap();
+    assert_eq!(s.eviction_order, vec![EvictionPriority::Idle]);
+    assert!(s.usage() <= s.limit);
+}
+
+#[test]
+async fn rejects_when_nothing_can_be_freed() {
+    let state = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        // All usage is pinned (mid-admission), nothing evictable.
+        pinned_usage: 950,
+        residents: vec![],
+        ..Default::default()
+    }));
+    let ctrl = controller(state.clone(), 0);
+    let source = FakeEvictionSource {
+        state: state.clone(),
+    };
+
+    let decision = apply_admit(&ctrl, &source, &state, 200).await;
+    assert_eq!(decision, AdmissionDecision::Reject);
+    // No over-commit: usage unchanged.
+    assert_eq!(state.lock().unwrap().usage(), 950);
+}
+
+#[test]
+async fn reserve_is_kept_free() {
+    let state = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 700,
+        residents: vec![],
+        ..Default::default()
+    }));
+    // headroom = 300, reserve = 200 → admissible = 100. Request 150 → reject.
+    let ctrl = controller(state.clone(), 200);
+    let source = FakeEvictionSource {
+        state: state.clone(),
+    };
+
+    assert_eq!(
+        apply_admit(&ctrl, &source, &state, 150).await,
+        AdmissionDecision::Reject
+    );
+    // But a request within the admissible window succeeds.
+    assert_eq!(
+        apply_admit(&ctrl, &source, &state, 100).await,
+        AdmissionDecision::Admit
+    );
+}
+
+// ── Property tests ───────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+enum Op {
+    Admit(u64),
+}
+
+fn arb_resident_priority() -> impl Strategy<Value = EvictionPriority> {
+    prop_oneof![Just(EvictionPriority::Idle), Just(EvictionPriority::Warm)]
+}
+
+fn arb_ops() -> impl Strategy<Value = Vec<Op>> {
+    prop::collection::vec((1u64..800).prop_map(Op::Admit), 0..40)
+}
+
+/// Strategy yielding a `(limit, residents)` start state where the residents fit
+/// under the limit by construction, by carving each resident's size out of a
+/// remaining budget. A resident set exceeding the limit cannot occur in reality
+/// (it would already have been OOM-killed), so it is not a valid start state.
+fn arb_fitting_state(
+    limit_range: std::ops::Range<u64>,
+    max_residents: usize,
+) -> impl Strategy<Value = (u64, Vec<Resident>)> {
+    limit_range.prop_flat_map(move |limit| {
+        // Reserve a fraction of the limit for residents (0..=80%) so there is
+        // usually some free headroom in the start state too. Each resident then
+        // takes a slice of that budget.
+        (
+            Just(limit),
+            (0u64..=(limit * 4 / 5)),
+            prop::collection::vec((1u64..=1000, arb_resident_priority()), 0..max_residents),
+        )
+            .prop_map(|(limit, mut budget, raw)| {
+                let mut residents = Vec::new();
+                for (weight, priority) in raw {
+                    if budget == 0 {
+                        break;
+                    }
+                    // Each resident is at most a third of the remaining budget,
+                    // so several can coexist; clamp to whatever budget is left.
+                    let size = weight.min(budget.div_ceil(3)).max(1).min(budget);
+                    residents.push(Resident { size, priority });
+                    budget -= size;
+                }
+                (limit, residents)
+            })
+    })
+}
+
+proptest! {
+    /// Safety invariant: across any random sequence of admits — with random
+    /// pre-resident work, random sizes, and a random reserve — modeled usage
+    /// must never exceed the limit. This is the property that rules out OOM.
+    #[test]
+    fn usage_never_exceeds_limit(
+        (limit, residents) in arb_fitting_state(500..5000, 20),
+        reserve in 0u64..300,
+        ops in arb_ops(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone(), reserve);
+            let source = FakeEvictionSource { state: state.clone() };
+
+            for op in ops {
+                match op {
+                    Op::Admit(req) => {
+                        apply_admit(&ctrl, &source, &state, req).await;
+                        let s = state.lock().unwrap();
+                        prop_assert!(
+                            s.usage() <= s.limit,
+                            "usage {} exceeded limit {}", s.usage(), s.limit
+                        );
+                    }
+                }
+            }
+            Ok(())
+        }).unwrap();
+    }
+
+    /// No spurious eviction: if every admit in the sequence fits within the
+    /// admissible headroom at the moment it is issued, nothing is ever evicted.
+    /// We guarantee the precondition by giving a huge limit and small requests.
+    #[test]
+    fn no_eviction_when_headroom_ample(
+        residents in prop::collection::vec(
+            (1u64..500, arb_resident_priority())
+                .prop_map(|(size, priority)| Resident { size, priority }),
+            0..20,
+        ),
+        ops in prop::collection::vec((1u64..50).prop_map(Op::Admit), 0..30),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit: 1_000_000,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone(), 0);
+            let source = FakeEvictionSource { state: state.clone() };
+
+            for op in ops {
+                match op {
+                    Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; }
+                }
+            }
+            prop_assert_eq!(state.lock().unwrap().evictions, 0);
+            Ok(())
+        }).unwrap();
+    }
+
+    /// Eviction ordering: whenever eviction happens, no warm item is evicted
+    /// while an idle item was still available to evict at that step. We check
+    /// this on the eviction order recorded across the whole op sequence: once a
+    /// warm eviction appears, no idle eviction may follow. That is sound here
+    /// because admits only grow `pinned_usage` and never add new residents.
+    #[test]
+    fn idle_evicted_before_warm(
+        (limit, residents) in arb_fitting_state(500..3000, 25),
+        ops in prop::collection::vec((1u64..1500).prop_map(Op::Admit), 1..20),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone(), 0);
+            let source = FakeEvictionSource { state: state.clone() };
+
+            for op in ops {
+                match op {
+                    Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; }
+                }
+            }
+
+            // Once a warm eviction appears in the order, an idle eviction must
+            // never follow it (idle is always exhausted first).
+            let order = state.lock().unwrap().eviction_order.clone();
+            let mut seen_warm = false;
+            for p in order {
+                match p {
+                    EvictionPriority::Warm => seen_warm = true,
+                    EvictionPriority::Idle => prop_assert!(
+                        !seen_warm,
+                        "idle eviction followed a warm eviction"
+                    ),
+                }
+            }
+            Ok(())
+        }).unwrap();
+    }
+}
+
+// ── Carve-out ratio ──────────────────────────────────────────────────────────
+
+#[test]
+async fn usable_ratio_caps_admission_below_full_limit() {
+    let state = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 0,
+        residents: vec![],
+        ..Default::default()
+    }));
+    // ceiling = 0.8 * 1000 = 800. Request 850 must be rejected even though the
+    // raw limit (1000) would allow it — the top 20% is reserved for the host.
+    let ctrl = controller_with_ratio(state.clone(), 0.8, 0);
+    let source = FakeEvictionSource {
+        state: state.clone(),
+    };
+
+    assert_eq!(
+        apply_admit(&ctrl, &source, &state, 850).await,
+        AdmissionDecision::Reject
+    );
+    assert_eq!(
+        apply_admit(&ctrl, &source, &state, 800).await,
+        AdmissionDecision::Admit
+    );
+}
+
+// ── Concurrency: the simultaneous-big-start race ─────────────────────────────
+
+proptest! {
+    /// The contract for the safety invariant under concurrency.
+    ///
+    /// Many admissions race at once with no external serialisation across the
+    /// headroom check and the commit (the commit models the upstream atomic
+    /// permit grant; the check is a separate prior read, so a genuine
+    /// time-of-check/time-of-use window exists between concurrent tasks).
+    ///
+    /// The invariant: real usage must never exceed the true `limit`. Admissions
+    /// may collectively overshoot the carve-out ceiling into the reserve — that
+    /// is what the reserve is for — but never past `limit` itself. The reserve
+    /// is sized to cover the worst-case concurrent overshoot (racers × request)
+    /// and each racer attempts twice, so total demand exceeds `limit` and an
+    /// over-admitting gate can actually breach it. If this ever fails, the
+    /// margin is insufficient for the chosen concurrency.
+    #[test]
+    fn concurrent_admissions_never_exceed_limit(
+        racers in 2usize..16,
+        request in 50u64..400,
+    ) {
+        // Worst case: every racer passes the check against the same snapshot and
+        // commits. `racers × request` of reserve absorbs that (one request spare).
+        let reserve = request * racers as u64;
+        // Exactly one request fits above the reserve, so with usage starting at
+        // zero the gate must reject every check made after the first commit.
+        let limit = reserve + request;
+
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(4)
+            .build()
+            .unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                pinned_usage: 0,
+                residents: vec![],
+                ..Default::default()
+            }));
+            let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve));
+
+            let mut handles = Vec::new();
+            for _ in 0..racers {
+                let ctrl = ctrl.clone();
+                let state = state.clone();
+                handles.push(tokio::spawn(async move {
+                    let source = FakeEvictionSource { state: state.clone() };
+                    for _ in 0..2 {
+                        let decision = ctrl.try_admit(request, &source).await;
+                        if decision == AdmissionDecision::Admit {
+                            // Models the atomic permit grant, made after the check.
+                            state.lock().unwrap().pinned_usage += request;
+                        }
+                    }
+                }));
+            }
+            for h in handles {
+                h.await.unwrap();
+            }
+
+            let s = state.lock().unwrap();
+            prop_assert!(
+                s.usage() <= s.limit,
+                "concurrent admissions drove usage {} past limit {}",
+                s.usage(), s.limit
+            );
+            Ok(())
+        }).unwrap();
+    }
+}
diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs
new file mode 100644
index 0000000000..0d1c4088a3
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs
@@ -0,0 +1,180 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Platform-abstracted probe of the executor's real memory usage and limit.
+//!
+//! Reports the measured resident memory and hard limit of the process's
+//! environment, used as the authoritative input to admission decisions (in
+//! contrast to the estimate-based semaphore in [`super::ActiveWorkers`]).
+//!
+//! The trait is abstract over where the limit comes from: a containerised Linux
+//! deployment reads it from the cgroup, an unconstrained process reads host RAM,
+//! a configured override pins it explicitly. Backend fidelity is asymmetric —
+//! cgroup v2 gives the exact kernel-enforced number; other targets fall back to
+//! best-effort process RSS via [`ProcessRssProbe`] until dedicated macOS and
+//! Windows backends land.
+
+use std::fmt::Debug;
+
+/// A snapshot of the executor environment's memory state.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct MemorySnapshot {
+    /// Hard ceiling: cgroup `memory.max` on constrained Linux, configured cap
+    /// or host RAM otherwise. Reaching this with `current` triggers an
+    /// OOM-kill.
+    pub limit_bytes: u64,
+    /// Currently-resident bytes: cgroup `memory.current` on Linux (touched
+    /// pages, lagging but exact), process RSS otherwise.
+    pub current_bytes: u64,
+}
+
+impl MemorySnapshot {
+    /// Bytes between current usage and the hard limit. Saturating: never
+    /// underflows if `current` momentarily exceeds the reported `limit`.
+    pub fn headroom_bytes(&self) -> u64 {
+        self.limit_bytes.saturating_sub(self.current_bytes)
+    }
+}
+
+/// Reads the executor environment's real memory state. Cheap enough to sample
+/// at admission time, but not on every wasmtime `memory.grow` (that is what the
+/// estimate-semaphore pre-check absorbs).
+pub trait MemoryProbe: Send + Sync + Debug {
+    fn snapshot(&self) -> MemorySnapshot;
+
+    fn limit_bytes(&self) -> u64 {
+        self.snapshot().limit_bytes
+    }
+
+    fn current_bytes(&self) -> u64 {
+        self.snapshot().current_bytes
+    }
+
+    fn headroom_bytes(&self) -> u64 {
+        self.snapshot().headroom_bytes()
+    }
+}
+
+/// A probe whose limit is fixed at construction and whose current usage comes
+/// from cross-platform process RSS via `sysinfo`.
+///
+/// This is the best-effort fallback used wherever no higher-fidelity backend
+/// is available yet (notably macOS and Windows). It is also used when a
+/// `system_memory_override` pins the limit explicitly.
+#[derive(Debug)]
+pub struct ProcessRssProbe {
+    limit_bytes: u64,
+}
+
+impl ProcessRssProbe {
+    pub fn new(limit_bytes: u64) -> Self {
+        Self { limit_bytes }
+    }
+
+    fn current_rss() -> u64 {
+        let mut sysinfo = sysinfo::System::new();
+        let pid = sysinfo::Pid::from_u32(std::process::id());
+        sysinfo.refresh_processes(sysinfo::ProcessesToUpdate::Some(&[pid]), true);
+        sysinfo.process(pid).map(|p| p.memory()).unwrap_or_default()
+    }
+}
+
+impl MemoryProbe for ProcessRssProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.limit_bytes,
+            current_bytes: Self::current_rss(),
+        }
+    }
+}
+
+/// Linux cgroup v2 probe. Reads `memory.max` and `memory.current` from the
+/// process's cgroup.
+#[cfg(target_os = "linux")]
+#[derive(Debug)]
+pub struct CgroupV2Probe {
+    /// Resolved path to the cgroup directory, e.g. `/sys/fs/cgroup`.
+    base: std::path::PathBuf,
+    /// Fallback limit used when `memory.max` reads `max` (unlimited) — usually
+    /// host RAM or the configured override.
+    fallback_limit_bytes: u64,
+}
+
+#[cfg(target_os = "linux")]
+impl CgroupV2Probe {
+    /// Cgroup directory inspected by [`Self::try_new`].
+    const DEFAULT_BASE: &'static str = "/sys/fs/cgroup";
+
+    /// Attempts to construct a cgroup v2 probe. Returns `None` when the host is
+    /// not running cgroup v2 (no unified `memory.current` at the base path), so
+    /// the caller can fall back to [`ProcessRssProbe`].
+    pub fn try_new(fallback_limit_bytes: u64) -> Option<Self> {
+        let base = std::path::PathBuf::from(Self::DEFAULT_BASE);
+        // cgroup v2 unified hierarchy exposes memory.current directly at the
+        // delegated cgroup path. If it is not readable we are not on v2.
+        if std::fs::read_to_string(base.join("memory.current")).is_ok() {
+            Some(Self {
+                base,
+                fallback_limit_bytes,
+            })
+        } else {
+            None
+        }
+    }
+
+    /// Reads `file` under the cgroup directory and parses its trimmed contents
+    /// as a byte count. Returns `None` if the file cannot be read or does not
+    /// hold a plain unsigned integer (such as the literal `max`).
+    fn read_u64(&self, file: &str) -> Option<u64> {
+        let raw = std::fs::read_to_string(self.base.join(file)).ok()?;
+        raw.trim().parse::<u64>().ok()
+    }
+
+    /// Resolves the effective limit from `memory.max`, which contains either a
+    /// number of bytes or the literal `max`. Anything other than a parsable
+    /// number — `max`, an unreadable file, malformed contents — yields
+    /// `fallback_limit_bytes`.
+    fn read_limit(&self) -> u64 {
+        // `read_u64` already returns `None` for every non-numeric case, so no
+        // separate comparison against `max` is needed before falling back.
+        self.read_u64("memory.max").unwrap_or(self.fallback_limit_bytes)
+    }
+}
+
+
+#[cfg(target_os = "linux")]
+impl MemoryProbe for CgroupV2Probe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.read_limit(),
+            current_bytes: self.read_u64("memory.current").unwrap_or(0),
+        }
+    }
+}
+
+/// Constructs the best available probe for the current platform.
+///
+/// On Linux, prefers cgroup v2; falls back to process RSS. On other targets,
+/// uses process RSS until dedicated backends land. `limit_bytes` is the limit
+/// to charge against and is also the fallback when the cgroup reports an
+/// unlimited `memory.max`.
+pub fn default_probe(limit_bytes: u64) -> Box<dyn MemoryProbe> {
+    #[cfg(target_os = "linux")]
+    {
+        if let Some(probe) = CgroupV2Probe::try_new(limit_bytes) {
+            return Box::new(probe);
+        }
+    }
+    Box::new(ProcessRssProbe::new(limit_bytes))
+}
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 3a9ece958b..00a3fc6f4d 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -12,9 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod admission;
 pub mod concurrent_agents_scheduler;
 pub mod concurrent_agents_semaphore;
 pub mod fs_semaphore;
+pub mod memory_probe;
 #[cfg(test)]
 mod tests;
 
@@ -26,6 +28,9 @@ pub use fs_semaphore::{
     filesystem_storage_permits_to_bytes, filesystem_storage_pool_bytes_to_permits,
 };
 
+use admission::{AdmissionController, AdmissionDecision, EvictionPriority, EvictionSource};
+use async_trait::async_trait;
+use memory_probe::default_probe;
 use std::sync::Arc;
 use std::time::Duration;
 use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore, TryAcquireError};
@@ -70,6 +75,11 @@ pub struct ActiveWorkers<Ctx: WorkerCtx> {
     concurrent_agents: Arc<ConcurrentAgentsScheduler>,
     priority_allocation_lock: Arc<Mutex<()>>,
     acquire_retry_delay: Duration,
+    /// Authoritative measured-headroom admission gate. Decides whether real
+    /// memory headroom permits a new acquisition, evicting via the worker set
+    /// when short. The estimate-based `worker_memory` semaphore is the cheap
+    /// pre-filter and atomic commit in front of it.
+    admission: AdmissionController,
 }
 
 #[derive(Debug)]
@@ -110,6 +120,10 @@ impl Drop for WorkerMemoryPermit {
 impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self {
         let worker_memory_size = memory_config.worker_memory();
+        let admission = AdmissionController::new(
+            default_probe(memory_config.total_system_memory()),
+            memory_config.admission_policy(),
+        );
         let active_workers = Self {
             workers: Cache::new(
                 None,
@@ -125,6 +139,7 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
             concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()),
             acquire_retry_delay: memory_config.acquire_retry_delay,
             priority_allocation_lock: Arc::new(Mutex::new(())),
+            admission,
         };
         active_workers.initialize_metrics(worker_memory_size);
         active_workers
@@ -208,6 +223,20 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
             .expect("requested memory size is too large");
 
         loop {
+            // Authoritative measured-headroom gate. Evicts idle-then-warm when
+            // real headroom is short; rejects (and we back off) when it cannot
+            // make room rather than risking the limit.
+            if self
+                .admission
+                .try_admit(memory, &self.eviction_source())
+                .await
+                == AdmissionDecision::Reject
+            {
+                debug!("Measured headroom insufficient for {mem32}, backing off and retrying");
+                tokio::time::sleep(self.acquire_retry_delay).await;
+                continue;
+            }
+
             let available = self.worker_memory.available_permits();
             let lock = self.priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once
             let result = self.worker_memory.clone().try_acquire_many_owned(mem32);
@@ -249,10 +278,32 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         }
     }
 
+    /// Builds an [`EvictionSource`] view over the live worker set for the
+    /// admission controller to reclaim memory through.
+    fn eviction_source(&self) -> WorkerEvictionSource<Ctx> {
+        WorkerEvictionSource {
+            workers: self.workers.clone(),
+        }
+    }
+
     pub async fn try_acquire(&self, memory: u64) -> Option<WorkerMemoryPermit> {
         let mem32: u32 = memory
             .try_into()
             .expect("requested memory size is too large");
+
+        // Authoritative measured-headroom gate. Single attempt (this is the
+        // non-blocking path): if real headroom is insufficient even after
+        // eviction, do not admit.
+        if self
+            .admission
+            .try_admit(memory, &self.eviction_source())
+            .await
+            == AdmissionDecision::Reject
+        {
+            debug!("Measured headroom insufficient for {mem32}, not admitting");
+            return None;
+        }
+
         let mut lock = None;
         loop {
             match self.worker_memory.clone().try_acquire_many_owned(mem32) {
@@ -289,73 +340,23 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         let current_avail = self.worker_memory.available_permits();
         let needed = memory.saturating_sub(current_avail as u64);
 
-        if needed > 0 {
-            let mut idle_candidates = Vec::new();
-            let mut warm_candidates = Vec::new();
-
-            debug!("Collecting memory eviction candidates");
-            let pairs = self.workers.iter().await;
-            for (agent_id, worker) in pairs {
-                if let Some(class) = worker.eviction_class().await
-                    && let Ok(mem) = worker.memory_requirement().await
-                {
-                    let last_changed = worker.last_execution_state_change();
-                    let entry = (agent_id, worker, mem, last_changed);
-                    match class {
-                        crate::worker::EvictionClass::LoadedIdle => {
-                            idle_candidates.push(entry);
-                        }
-                        crate::worker::EvictionClass::WarmRunnable => {
-                            warm_candidates.push(entry);
-                        }
-                    }
-                }
-            }
-
-            // Sort each bucket by timestamp — newest first so we pop oldest
-            idle_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis());
-            idle_candidates.reverse();
-            warm_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis());
-            warm_candidates.reverse();
-
-            let mut freed = 0u64;
-
-            // First evict LoadedIdle workers (cheapest)
-            while freed < needed && !idle_candidates.is_empty() {
-                let (agent_id, worker, mem, _) = idle_candidates.pop().unwrap();
-                debug!("Trying to stop idle {agent_id} to free up memory");
-                if worker
-                    .stop_if_evictable(crate::worker::EvictionClass::LoadedIdle)
-                    .await
-                {
-                    debug!("Stopped idle {agent_id} to free up {mem} memory");
-                    crate::metrics::workers::record_worker_eviction("LoadedIdle");
-                    freed += mem;
-                }
-            }
+        if needed == 0 {
+            debug!("Memory was freed up in the meantime");
+            return true;
+        }
 
-            // Then evict WarmRunnable workers if still under pressure
-            while freed < needed && !warm_candidates.is_empty() {
-                let (agent_id, worker, mem, _) = warm_candidates.pop().unwrap();
-                debug!("Trying to stop warm-runnable {agent_id} to free up memory");
-                if worker
-                    .stop_if_evictable(crate::worker::EvictionClass::WarmRunnable)
-                    .await
-                {
-                    debug!("Stopped warm-runnable {agent_id} to free up {mem} memory");
-                    crate::metrics::workers::record_worker_eviction("WarmRunnable");
-                    freed += mem;
-                }
+        let mut freed = 0u64;
+        for priority in [EvictionPriority::Idle, EvictionPriority::Warm] {
+            if freed >= needed {
+                break;
             }
+            freed += evict_at_most_memory(&self.workers, priority, needed - freed).await;
+        }
 
-            if freed > 0 {
-                debug!("Freed up {freed}");
-            }
-            freed >= needed
-        } else {
-            debug!("Memory was freed up in the meantime");
-            true
+        if freed > 0 {
+            debug!("Freed up {freed}");
         }
+        freed >= needed
     }
 
     /// Blocking acquire of storage semaphore permits. Loops until the requested
@@ -479,3 +480,66 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         crate::metrics::storage::record_worker_memory_pool_total(worker_memory_size as u64);
     }
 }
+
+impl From<EvictionPriority> for crate::worker::EvictionClass {
+    fn from(priority: EvictionPriority) -> Self {
+        match priority {
+            EvictionPriority::Idle => Self::LoadedIdle,
+            EvictionPriority::Warm => Self::WarmRunnable,
+        }
+    }
+}
+
+/// Evicts resident workers of one priority tier, oldest state change first, until
+/// `needed_bytes` are freed or the tier is exhausted; returns the bytes reclaimed.
+async fn evict_at_most_memory<Ctx: WorkerCtx>(
+    workers: &Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
+    priority: EvictionPriority,
+    needed_bytes: u64,
+) -> u64 {
+    let target_class = crate::worker::EvictionClass::from(priority);
+    let metric_label = match priority {
+        EvictionPriority::Idle => "LoadedIdle",
+        EvictionPriority::Warm => "WarmRunnable",
+    };
+
+    let mut tier = Vec::new();
+    for (agent_id, worker) in workers.iter().await {
+        if let Some(class) = worker.eviction_class().await
+            && class == target_class
+            && let Ok(mem) = worker.memory_requirement().await
+        {
+            let last_changed = worker.last_execution_state_change();
+            tier.push((agent_id, worker, mem, last_changed));
+        }
+    }
+    // A stable ascending sort keeps the oldest state change at the front.
+    tier.sort_by_key(|(_, _, _, ts)| ts.to_millis());
+
+    let mut freed = 0u64;
+    for (agent_id, worker, mem, _) in tier {
+        if freed >= needed_bytes {
+            break;
+        }
+        debug!("Trying to stop {target_class:?} {agent_id} to free up memory");
+        if worker.stop_if_evictable(target_class).await {
+            debug!("Stopped {target_class:?} {agent_id} to free up {mem} memory");
+            crate::metrics::workers::record_worker_eviction(metric_label);
+            freed += mem;
+        }
+    }
+    freed
+}
+
+/// Adapts the live worker set to the [`EvictionSource`] the admission controller
+/// drives. Holds a cheap clone of the worker cache handle.
+struct WorkerEvictionSource<Ctx: WorkerCtx> {
+    workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
+}
+
+#[async_trait]
+impl<Ctx: WorkerCtx> EvictionSource for WorkerEvictionSource<Ctx> {
+    async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 {
+        evict_at_most_memory(&self.workers, priority, needed_bytes).await
+    }
+}
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 76b7720bf0..29ea514cb6 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -969,6 +969,10 @@ pub struct MemoryConfig {
     /// so this term over-accounts per-worker memory for large components.
     /// Lower this (e.g. to 0.0) to size permits primarily off linear memory.
     pub component_size_coefficient: f64,
+    /// Bytes of measured headroom kept free below the usable ceiling as a margin
+    /// against concurrent admissions overshooting before becoming resident. Used
+    /// by the measured-headroom admission gate.
+    pub admission_reserve_bytes: u64,
     #[serde(with = "humantime_serde")]
     pub acquire_retry_delay: Duration,
     pub oom_retry_config: RetryConfig,
@@ -992,6 +996,17 @@ impl MemoryConfig {
     pub fn worker_memory(&self) -> usize {
         (self.total_system_memory() as f64 * self.worker_memory_ratio) as usize
     }
+
+    /// Builds the policy used by the measured-headroom admission gate:
+    /// `worker_memory_ratio` is reused as the usable fraction of the measured
+    /// limit and `admission_reserve_bytes` as the concurrent overshoot margin.
+    pub fn admission_policy(&self) -> crate::services::active_workers::admission::AdmissionPolicy {
+        use crate::services::active_workers::admission::AdmissionPolicy;
+        AdmissionPolicy {
+            usable_ratio: self.worker_memory_ratio,
+            reserve_bytes: self.admission_reserve_bytes,
+        }
+    }
 }
 
 impl SafeDisplay for MemoryConfig {
@@ -1015,6 +1030,11 @@ impl SafeDisplay for MemoryConfig {
             "component size coefficient: {}",
             self.component_size_coefficient
         );
+        let _ = writeln!(
+            &mut result,
+            "admission reserve bytes: {}",
+            self.admission_reserve_bytes
+        );
         let _ = writeln!(
             &mut result,
             "acquire retry delay: {:?}",
@@ -1540,6 +1560,7 @@ impl Default for MemoryConfig {
             worker_memory_ratio: 0.8,
             worker_estimate_coefficient: 1.1,
             component_size_coefficient: 2.0,
+            admission_reserve_bytes: 256 * 1024 * 1024,
             acquire_retry_delay: Duration::from_millis(500),
             oom_retry_config: RetryConfig {
                 max_attempts: u32::MAX,

From 817c6726821dfb9195a6e54424d0acc455819d57 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Mon, 8 Jun 2026 21:30:57 -0700
Subject: [PATCH 21/60] feat(worker-executor): charge component module size
 once per resident component

---
 .../active_workers/component_charge/mod.rs    | 171 +++++++++++++
 .../active_workers/component_charge/tests.rs  | 206 ++++++++++++++++
 .../src/services/active_workers/mod.rs        | 230 ++++++++++++------
 .../src/services/golem_config.rs              |   7 +-
 golem-worker-executor/src/worker/mod.rs       |  64 ++++-
 5 files changed, 593 insertions(+), 85 deletions(-)
 create mode 100644 golem-worker-executor/src/services/active_workers/component_charge/mod.rs
 create mode 100644 golem-worker-executor/src/services/active_workers/component_charge/tests.rs

diff --git a/golem-worker-executor/src/services/active_workers/component_charge/mod.rs b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs
new file mode 100644
index 0000000000..8ddd4aa8aa
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs
@@ -0,0 +1,171 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Per-component memory charge for the shared compiled module.
+//!
+//! A component's compiled module is loaded into the wasmtime engine once and
+//! shared by every worker of that component, so its size must be charged to the
+//! memory pool once per resident component rather than once per worker. This
+//! registry tracks how many workers of each component are resident and holds a
+//! single module-sized charge for as long as at least one is.
+//!
+//! The charge is represented by an opaque guard obtained from a [`ChargeSource`]
+//! (the worker memory pool in production). The first resident worker of a
+//! component acquires the charge; the last to unload drops it. The registry is
+//! decoupled from the pool via [`ChargeSource`] so the refcounting can be
+//! property-tested in isolation.
+
+use async_trait::async_trait;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::sync::{Arc, Mutex};
+
+/// Acquires an opaque, RAII charge of a given byte size from some pool. The
+/// returned value releases the charge when dropped.
+#[async_trait]
+pub trait ChargeSource: Send + Sync {
+    type Charge: Send + Sync + 'static;
+
+    async fn acquire_charge(&self, bytes: u64) -> Self::Charge;
+}
+
+/// Tracks resident-worker refcounts per component key and holds one module-sized
+/// charge per component while any worker of it is resident.
+pub struct ComponentChargeRegistry<K, S: ChargeSource> {
+    source: S,
+    state: Mutex<HashMap<K, Entry<S::Charge>>>,
+}
+
+struct Entry<C> {
+    refcount: usize,
+    /// The held module charge. `None` until the first resident worker publishes it.
+    charge: Option<Arc<C>>,
+}
+
+/// Handle representing one worker's residency of a component. While at least one
+/// `ComponentChargeGuard` for a key is alive, the registry holds that
+/// component's module charge. Dropping the last guard releases it.
+pub struct ComponentChargeGuard<K, S: ChargeSource>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+{
+    registry: Arc<ComponentChargeRegistry<K, S>>,
+    key: K,
+}
+
+impl<K, S> Debug for ComponentChargeGuard<K, S>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+    S: ChargeSource,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ComponentChargeGuard").finish()
+    }
+}
+
+/// Type-erased held component charge. A worker holds one of these for as long as
+/// it is resident; dropping it releases the worker's residency of its component.
+/// Erasing the source/key types lets non-generic holders store the guard.
+pub trait HeldComponentCharge: Send + Sync + Debug {}
+
+impl<K, S> HeldComponentCharge for ComponentChargeGuard<K, S>
+where
+    K: Eq + Hash + Clone + Send + Sync + 'static,
+    S: ChargeSource + 'static,
+    S::Charge: Sync,
+{
+}
+
+impl<K, S> ComponentChargeRegistry<K, S>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+    S: ChargeSource,
+{
+    pub fn new(source: S) -> Arc<Self> {
+        Arc::new(Self {
+            source,
+            state: Mutex::new(HashMap::new()),
+        })
+    }
+
+    /// Register one resident worker of `key` (whose module is `charge_bytes`).
+    /// Acquires the module charge if this is the first resident worker of the
+    /// component. The returned guard releases residency on drop.
+    pub async fn acquire(
+        self: &Arc<Self>,
+        key: K,
+        charge_bytes: u64,
+    ) -> ComponentChargeGuard<K, S> {
+        // Only the first resident worker of a component acquires the (possibly
+        // blocking) charge; that is decided under the lock, acquired outside it.
+        let must_acquire = {
+            let mut state = self.state.lock().unwrap();
+            let entry = state.entry(key.clone()).or_insert(Entry {
+                refcount: 0,
+                charge: None,
+            });
+            entry.refcount += 1;
+            entry.refcount == 1
+        };
+        // Build the guard before awaiting so that a dropped (cancelled) future
+        // releases its residency via `Drop` instead of leaking the refcount.
+        let guard = ComponentChargeGuard {
+            registry: self.clone(),
+            key,
+        };
+
+        if must_acquire {
+            let charge = Arc::new(self.source.acquire_charge(charge_bytes).await);
+            let mut state = self.state.lock().unwrap();
+            if let Some(entry) = state.get_mut(&guard.key) {
+                // Only publish if still resident (refcount could have churned).
+                if entry.refcount > 0 && entry.charge.is_none() {
+                    entry.charge = Some(charge);
+                }
+            }
+        }
+        guard
+    }
+
+    fn release(&self, key: &K) {
+        let mut state = self.state.lock().unwrap();
+        if let Some(entry) = state.get_mut(key) {
+            entry.refcount = entry.refcount.saturating_sub(1);
+            if entry.refcount == 0 {
+                // Forget the component; dropping its entry releases the charge.
+                state.remove(key);
+            }
+        }
+    }
+}
+
+impl<K, S> Drop for ComponentChargeGuard<K, S>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+    S: ChargeSource,
+{
+    fn drop(&mut self) {
+        self.registry.release(&self.key);
+    }
+}
+
+impl<K, S: ChargeSource> Debug for ComponentChargeRegistry<K, S> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ComponentChargeRegistry").finish()
+    }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/golem-worker-executor/src/services/active_workers/component_charge/tests.rs b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs
new file mode 100644
index 0000000000..c58f1ab937
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs
@@ -0,0 +1,206 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Tests for the per-component module charge registry.
+//!
+//! A [`FakeChargeSource`] models a pool by tracking total charged bytes in an
+//! atomic; each charge it hands out decrements that total when dropped. The
+//! tests then assert the registry's contract: a component's module is charged
+//! exactly once while any worker of it is resident, released when the last
+//! unloads, and never leaked or double-charged across acquire/drop churn.
+
+use super::*;
+use proptest::prelude::*;
+use std::sync::atomic::{AtomicU64, Ordering};
+use test_r::test;
+
+test_r::enable!();
+
+/// A charge that returns `bytes` to the shared counter when dropped.
+struct FakeCharge {
+    bytes: u64,
+    charged_total: Arc<AtomicU64>,
+}
+
+impl Drop for FakeCharge {
+    fn drop(&mut self) {
+        self.charged_total.fetch_sub(self.bytes, Ordering::SeqCst);
+    }
+}
+
+#[derive(Clone)]
+struct FakeChargeSource {
+    charged_total: Arc<AtomicU64>,
+    /// Number of times a charge was actually acquired, to detect double-charge.
+    acquire_count: Arc<AtomicU64>,
+}
+
+impl FakeChargeSource {
+    fn new() -> Self {
+        Self {
+            charged_total: Arc::new(AtomicU64::new(0)),
+            acquire_count: Arc::new(AtomicU64::new(0)),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl ChargeSource for FakeChargeSource {
+    type Charge = FakeCharge;
+
+    async fn acquire_charge(&self, bytes: u64) -> FakeCharge {
+        self.acquire_count.fetch_add(1, Ordering::SeqCst);
+        self.charged_total.fetch_add(bytes, Ordering::SeqCst);
+        FakeCharge {
+            bytes,
+            charged_total: self.charged_total.clone(),
+        }
+    }
+}
+
+const MODULE_BYTES: u64 = 17 * 1024 * 1024;
+
+// ── Single-case unit tests ───────────────────────────────────────────────────
+
+#[test]
+async fn first_worker_charges_once_last_releases() {
+    let source = FakeChargeSource::new();
+    let charged = source.charged_total.clone();
+    let count = source.acquire_count.clone();
+    let registry = ComponentChargeRegistry::new(source);
+
+    let g1 = registry.acquire("comp-a", MODULE_BYTES).await;
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+    assert_eq!(count.load(Ordering::SeqCst), 1);
+
+    // Second worker of the same component: no additional charge.
+    let g2 = registry.acquire("comp-a", MODULE_BYTES).await;
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+    assert_eq!(count.load(Ordering::SeqCst), 1);
+
+    // Dropping one of two keeps the charge.
+    drop(g1);
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+
+    // Dropping the last releases it.
+    drop(g2);
+    assert_eq!(charged.load(Ordering::SeqCst), 0);
+}
+
+#[test]
+async fn distinct_components_each_charge_once() {
+    let source = FakeChargeSource::new();
+    let charged = source.charged_total.clone();
+    let registry = ComponentChargeRegistry::new(source);
+
+    let _a = registry.acquire("comp-a", MODULE_BYTES).await;
+    let _b = registry.acquire("comp-b", MODULE_BYTES).await;
+    let _b2 = registry.acquire("comp-b", MODULE_BYTES).await;
+
+    // Two distinct components → charged twice, regardless of worker count.
+    assert_eq!(charged.load(Ordering::SeqCst), 2 * MODULE_BYTES);
+}
+
+#[test]
+async fn re_acquiring_after_full_release_charges_again() {
+    let source = FakeChargeSource::new();
+    let charged = source.charged_total.clone();
+    let count = source.acquire_count.clone();
+    let registry = ComponentChargeRegistry::new(source);
+
+    drop(registry.acquire("comp-a", MODULE_BYTES).await);
+    assert_eq!(charged.load(Ordering::SeqCst), 0);
+
+    // A fresh residency after full release acquires the charge again.
+    let _g = registry.acquire("comp-a", MODULE_BYTES).await;
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+    assert_eq!(count.load(Ordering::SeqCst), 2);
+}
+
+// ── Property tests ───────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+enum Op {
+    /// Acquire a guard for component index `usize`.
+    Acquire(usize),
+    /// Drop the n-th currently-held guard (modulo number held).
+    Drop(usize),
+}
+
+fn arb_ops(num_components: usize) -> impl Strategy<Value = Vec<Op>> {
+    prop::collection::vec(
+        prop_oneof![
+            (0..num_components).prop_map(Op::Acquire),
+            (0usize..100).prop_map(Op::Drop),
+        ],
+        0..80,
+    )
+}
+
+proptest! {
+    /// The charged total always equals the sum of `MODULE_BYTES` over the distinct
+    /// components that currently have at least one held guard. This is the core
+    /// "once per resident component" contract: never per-worker, never leaked,
+    /// never double-charged.
+    #[test]
+    fn charge_tracks_distinct_resident_components(
+        num_components in 1usize..6,
+        ops in arb_ops(6),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let source = FakeChargeSource::new();
+            let charged = source.charged_total.clone();
+            let registry = ComponentChargeRegistry::new(source);
+
+            // Held guards keyed by component index.
+            let mut held: Vec<(usize, ComponentChargeGuard<&'static str, FakeChargeSource>)> =
+                Vec::new();
+            let keys: Vec<&'static str> =
+                ["c0", "c1", "c2", "c3", "c4", "c5"][..num_components].to_vec();
+
+            for op in ops {
+                match op {
+                    Op::Acquire(i) => {
+                        let i = i % num_components;
+                        let guard = registry.acquire(keys[i], MODULE_BYTES).await;
+                        held.push((i, guard));
+                    }
+                    Op::Drop(n) => {
+                        if !held.is_empty() {
+                            let idx = n % held.len();
+                            held.remove(idx);
+                        }
+                    }
+                }
+
+                // Distinct resident component count == charged_total / MODULE_BYTES.
+                let mut distinct: Vec<usize> = held.iter().map(|(i, _)| *i).collect();
+                distinct.sort_unstable();
+                distinct.dedup();
+                let expected = distinct.len() as u64 * MODULE_BYTES;
+                prop_assert_eq!(
+                    charged.load(Ordering::SeqCst),
+                    expected,
+                    "charged total did not match distinct resident components"
+                );
+            }
+
+            // After dropping everything, nothing remains charged.
+            drop(held);
+            prop_assert_eq!(charged.load(Ordering::SeqCst), 0);
+            Ok(())
+        }).unwrap();
+    }
+}
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 00a3fc6f4d..34f8b190ec 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod admission;
+pub mod component_charge;
 pub mod concurrent_agents_scheduler;
 pub mod concurrent_agents_semaphore;
 pub mod fs_semaphore;
@@ -30,6 +31,8 @@ pub use fs_semaphore::{
 
 use admission::{AdmissionController, AdmissionDecision, EvictionPriority, EvictionSource};
 use async_trait::async_trait;
+pub use component_charge::HeldComponentCharge;
+use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry};
 use memory_probe::default_probe;
 use std::sync::Arc;
 use std::time::Duration;
@@ -45,7 +48,7 @@ use crate::workerctx::WorkerCtx;
 use golem_common::cache::{BackgroundEvictionMode, Cache, FullCacheEvictionMode, SimpleCache};
 use golem_common::model::account::AccountId;
 use golem_common::model::agent::Principal;
-use golem_common::model::component::ComponentRevision;
+use golem_common::model::component::{ComponentId, ComponentRevision};
 use golem_common::model::environment::EnvironmentId;
 use golem_common::model::invocation_context::InvocationContextStack;
 use golem_common::model::worker::AgentConfigEntryDto;
@@ -80,8 +83,22 @@ pub struct ActiveWorkers<Ctx: WorkerCtx> {
     /// when short. The estimate-based `worker_memory` semaphore is the cheap
     /// pre-filter and atomic commit in front of it.
     admission: AdmissionController,
+    /// Charges each resident component's compiled module size to the estimate
+    /// pool exactly once (shared across all its workers) rather than per worker.
+    component_charges:
+        Arc<ComponentChargeRegistry<ComponentChargeKey, MemoryPoolChargeSource<Ctx>>>,
+    /// Multiplier applied to a component's `component_size` when sizing its
+    /// module charge permit.
+    component_size_coefficient: f64,
 }
 
+/// Identifies a compiled component for module-charge accounting.
+type ComponentChargeKey = (ComponentId, ComponentRevision);
+
+/// Guard held by a resident worker keeping its component's module charge alive.
+pub type WorkerComponentCharge<Ctx> =
+    ComponentChargeGuard<ComponentChargeKey, MemoryPoolChargeSource<Ctx>>;
+
 #[derive(Debug)]
 pub struct WorkerMemoryPermit {
     permit: Option<OwnedSemaphorePermit>,
@@ -124,27 +141,56 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
             default_probe(memory_config.total_system_memory()),
             memory_config.admission_policy(),
         );
+        let workers = Cache::new(
+            None,
+            FullCacheEvictionMode::None,
+            BackgroundEvictionMode::None,
+            "active_workers",
+        );
+        let worker_memory = Arc::new(Semaphore::new(worker_memory_size));
+        let priority_allocation_lock = Arc::new(Mutex::new(()));
+        let component_charges = ComponentChargeRegistry::new(MemoryPoolChargeSource {
+            worker_memory: worker_memory.clone(),
+            workers: workers.clone(),
+            priority_allocation_lock: priority_allocation_lock.clone(),
+            acquire_retry_delay: memory_config.acquire_retry_delay,
+        });
         let active_workers = Self {
-            workers: Cache::new(
-                None,
-                FullCacheEvictionMode::None,
-                BackgroundEvictionMode::None,
-                "active_workers",
-            ),
-            worker_memory: Arc::new(Semaphore::new(worker_memory_size)),
+            workers,
+            worker_memory,
             worker_filesystem_storage: Arc::new(FilesystemStorageSemaphore::new(
                 storage_config.worker_filesystem_storage(),
                 storage_config.acquire_retry_delay,
             )),
             concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()),
             acquire_retry_delay: memory_config.acquire_retry_delay,
-            priority_allocation_lock: Arc::new(Mutex::new(())),
+            priority_allocation_lock,
             admission,
+            component_charges,
+            component_size_coefficient: memory_config.component_size_coefficient,
         };
         active_workers.initialize_metrics(worker_memory_size);
         active_workers
     }
 
+    /// Acquire (or share) the module charge for a worker of the given
+    /// component revision. The first resident worker of that revision pays its
+    /// compiled-module size (scaled by `component_size_coefficient`, truncated
+    /// to whole bytes) into the estimate pool; later workers share the charge.
+    /// The returned guard releases residency on drop, and the charge is freed
+    /// when the last worker of that revision unloads.
+    pub async fn acquire_component_charge(
+        &self,
+        component_id: ComponentId,
+        component_revision: ComponentRevision,
+        component_module_bytes: u64,
+    ) -> WorkerComponentCharge<Ctx> {
+        let charge_bytes = (self.component_size_coefficient * component_module_bytes as f64) as u64;
+        self.component_charges
+            .acquire((component_id, component_revision), charge_bytes)
+            .await
+    }
+
     pub async fn get_or_add<T>(
         &self,
         deps: &T,
@@ -237,44 +283,21 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
                 continue;
             }
 
-            let available = self.worker_memory.available_permits();
-            let lock = self.priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once
-            let result = self.worker_memory.clone().try_acquire_many_owned(mem32);
-            drop(lock);
-            match result {
-                Ok(permit) => {
-                    debug!(
-                        "Acquired {} memory of {}, new available: {}, permit size: {}",
-                        mem32,
-                        available,
-                        self.worker_memory.available_permits(),
-                        permit.num_permits()
-                    );
-                    break WorkerMemoryPermit::new(permit);
-                }
-                Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"),
-                Err(TryAcquireError::NoPermits) => {
-                    debug!(
-                        "Not enough memory to allocate {mem32} (available: {}), trying to free some up",
-                        self.worker_memory.available_permits()
-                    );
-                    if self.try_free_up_memory(memory).await {
-                        debug!("Freed up some memory, retrying");
-                        // We have enough memory unless another worker has taken it in the meantime,
-                        // so retry the loop
-                        continue;
-                    } else {
-                        debug!(
-                            "Could not free up memory, retrying asking for permits after some time"
-                        );
-                        // Could not free up enough memory, so waiting for permits to be available.
-                        // We cannot use acquire_many() to wait for the permits because it eagerly preallocates
-                        // the available permits, and by that causing deadlocks. So we sleep and retry.
-
-                        tokio::time::sleep(self.acquire_retry_delay).await;
-                    }
-                }
+            // Estimate-semaphore pool: cheap pre-check + atomic commit.
+            if let Some(permit) = acquire_pool_permit(
+                &self.worker_memory,
+                &self.workers,
+                &self.priority_allocation_lock,
+                self.acquire_retry_delay,
+                mem32,
+                memory,
+            )
+            .await
+            {
+                break permit;
             }
+            // No permit yet (eviction may have just freed room, or the helper
+            // already backed off); loop and re-run the gate before trying again.
         }
     }
 
@@ -336,29 +359,6 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         }
     }
 
-    async fn try_free_up_memory(&self, memory: u64) -> bool {
-        let current_avail = self.worker_memory.available_permits();
-        let needed = memory.saturating_sub(current_avail as u64);
-
-        if needed == 0 {
-            debug!("Memory was freed up in the meantime");
-            return true;
-        }
-
-        let mut freed = 0u64;
-        for priority in [EvictionPriority::Idle, EvictionPriority::Warm] {
-            if freed >= needed {
-                break;
-            }
-            freed += evict_at_most_memory(&self.workers, priority, needed - freed).await;
-        }
-
-        if freed > 0 {
-            debug!("Freed up {freed}");
-        }
-        freed >= needed
-    }
-
     /// Blocking acquire of storage semaphore permits. Loops until the requested
     /// number of bytes is available, evicting idle workers as needed.
     pub async fn acquire_filesystem_storage(&self, storage_bytes: u64) -> FilesystemStoragePermit {
@@ -531,8 +531,62 @@ async fn evict_at_most_memory<Ctx: WorkerCtx>(
     freed
 }
 
-/// Adapts the live worker set to the [`EvictionSource`] the admission controller
-/// drives. Holds a cheap clone of the worker cache handle.
+/// Frees up to `memory` estimate-permit bytes by evicting idle-then-warm
+/// workers, accounting for permits already available. Returns true when enough
+/// is (or was already) free.
+async fn try_free_up_pool_memory<Ctx: WorkerCtx>(
+    worker_memory: &Semaphore,
+    workers: &Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
+    memory: u64,
+) -> bool {
+    let current_avail = worker_memory.available_permits();
+    let needed = memory.saturating_sub(current_avail as u64);
+    if needed == 0 {
+        return true;
+    }
+
+    let mut freed = 0u64;
+    for priority in [EvictionPriority::Idle, EvictionPriority::Warm] {
+        if freed >= needed {
+            break;
+        }
+        freed += evict_at_most_memory(workers, priority, needed - freed).await;
+    }
+    freed >= needed
+}
+
+/// Single estimate-semaphore acquisition attempt. Returns the permit if
+/// `mem32` permits are free right now. Otherwise it evicts idle/warm workers
+/// towards `memory` bytes and returns `None` for the caller to retry: at once
+/// if enough was freed, after sleeping `acquire_retry_delay` if not. Shared by
+/// `ActiveWorkers::acquire` and the per-component charge source.
+async fn acquire_pool_permit<Ctx: WorkerCtx>(
+    worker_memory: &Arc<Semaphore>,
+    workers: &Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
+    priority_allocation_lock: &Mutex<()>,
+    acquire_retry_delay: Duration,
+    mem32: u32,
+    memory: u64,
+) -> Option<WorkerMemoryPermit> {
+    let lock = priority_allocation_lock.lock().await; // Wait out any priority request that is holding the lock for its one retry
+    let result = worker_memory.clone().try_acquire_many_owned(mem32);
+    drop(lock);
+    match result {
+        Ok(permit) => Some(WorkerMemoryPermit::new(permit)),
+        Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"),
+        Err(TryAcquireError::NoPermits) => {
+            if try_free_up_pool_memory(worker_memory, workers, memory).await {
+                // Freed enough; signal the caller to retry the acquire.
+                None
+            } else {
+                // Could not free enough; wait before the caller retries.
+                tokio::time::sleep(acquire_retry_delay).await;
+                None
+            }
+        }
+    }
+}
+
 struct WorkerEvictionSource<Ctx: WorkerCtx> {
     workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
 }
@@ -543,3 +597,37 @@ impl<Ctx: WorkerCtx> EvictionSource for WorkerEvictionSource<Ctx> {
         evict_at_most_memory(&self.workers, priority, needed_bytes).await
     }
 }
+
+/// Production [`ChargeSource`] for the per-component module charge. Takes
+/// estimate-semaphore permits via the same pool acquire+evict path as worker
+/// memory (the measured-headroom gate already accounts for the resident module
+/// via real RSS, so the charge does not pass through it).
+pub struct MemoryPoolChargeSource<Ctx: WorkerCtx> {
+    worker_memory: Arc<Semaphore>,
+    workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
+    priority_allocation_lock: Arc<Mutex<()>>,
+    acquire_retry_delay: Duration,
+}
+
+#[async_trait]
+impl<Ctx: WorkerCtx> ChargeSource for MemoryPoolChargeSource<Ctx> {
+    type Charge = WorkerMemoryPermit;
+
+    async fn acquire_charge(&self, bytes: u64) -> WorkerMemoryPermit {
+        let mem32: u32 = bytes.try_into().expect("component charge size too large");
+        loop {
+            if let Some(permit) = acquire_pool_permit(
+                &self.worker_memory,
+                &self.workers,
+                &self.priority_allocation_lock,
+                self.acquire_retry_delay,
+                mem32,
+                bytes,
+            )
+            .await
+            {
+                break permit;
+            }
+        }
+    }
+}
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 29ea514cb6..fdac19ed9c 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -963,11 +963,8 @@ pub struct MemoryConfig {
     pub system_memory_override: Option<u64>,
     pub worker_memory_ratio: f64,
     pub worker_estimate_coefficient: f64,
-    /// Multiplier applied to a worker's `component_size` when estimating its
-    /// memory permit requirement. The compiled component is loaded into the
-    /// engine once per component (shared across all workers of that component),
-    /// so this term over-accounts per-worker memory for large components.
-    /// Lower this (e.g. to 0.0) to size permits primarily off linear memory.
+    /// Multiplier applied to a component's `component_size`, charged once per
+    /// resident component (shared across all its workers) rather than per worker.
     pub component_size_coefficient: f64,
     /// Bytes of measured headroom kept free below the usable ceiling as a margin
     /// against concurrent admissions overshooting before becoming resident. Used
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index a65d6dd867..e9e8dbed8f 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -27,7 +27,8 @@ use crate::durable_host::recover_stderr_logs;
 use crate::metrics::storage::record_filesystem_pool_released;
 use crate::model::{AgentConfig, ExecutionStatus, LookupResult, ReadFileResult, TrapType};
 use crate::services::active_workers::{
-    FilesystemStoragePermit, RegisteredConcurrentAccount, WorkerMemoryPermit,
+    FilesystemStoragePermit, HeldComponentCharge, RegisteredConcurrentAccount,
+    WorkerComponentCharge, WorkerMemoryPermit,
 };
 use crate::services::events::{Event, EventsSubscription};
 use crate::services::golem_config::SnapshotPolicy;
@@ -58,6 +59,7 @@ use golem_common::model::agent::{
     AgentMode, ParsedAgentId, Principal, Snapshotting, SnapshottingConfig,
 };
 use golem_common::model::component::CanonicalFilePath;
+use golem_common::model::component::ComponentId;
 use golem_common::model::component::ComponentRevision;
 use golem_common::model::invocation_context::InvocationContextStack;
 use golem_common::model::oplog::{OplogEntry, OplogIndex, UpdateDescription};
@@ -122,7 +124,6 @@ pub struct Worker<Ctx: WorkerCtx> {
     execution_status: Arc<std::sync::RwLock<ExecutionStatus>>,
     update_state_lock: Mutex<()>,
     worker_estimate_coefficient: f64,
-    component_size_coefficient: f64,
 
     // IMPORTANT: Every external operation must acquire the instance lock, even briefly, to confirm the worker isn’t deleting.
     instance: Arc<Mutex<WorkerInstance>>,
@@ -341,7 +342,6 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             last_known_status: current_status,
             metrics_status,
             worker_estimate_coefficient: deps.config().memory.worker_estimate_coefficient,
-            component_size_coefficient: deps.config().memory.component_size_coefficient,
             oom_retry_config: deps.config().memory.oom_retry_config.clone(),
             snapshot_policy,
             update_state_lock: Mutex::new(()),
@@ -797,15 +797,29 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
         self.execution_status.read().unwrap().agent_mode()
     }
 
-    /// Gets the estimated memory requirement of the worker
+    /// Gets the estimated memory requirement of the worker.
+    ///
+    /// This covers only the per-worker linear memory. The compiled component
+    /// module is shared by all workers of a component and is charged once per
+    /// resident component via the component-charge registry, not per worker.
     pub async fn memory_requirement(&self) -> Result<u64, WorkerExecutorError> {
         let metadata = self.get_latest_worker_metadata().await;
 
-        let ml = metadata.last_known_status.total_linear_memory_size as f64;
-        let sw = metadata.last_known_status.component_size as f64;
-        let c = self.component_size_coefficient;
-        let x = self.worker_estimate_coefficient;
-        Ok((x * (ml + c * sw)) as u64)
+        let linear_memory_bytes = metadata.last_known_status.total_linear_memory_size as f64;
+        let estimate_coefficient = self.worker_estimate_coefficient;
+        Ok((estimate_coefficient * linear_memory_bytes) as u64)
+    }
+
+    /// Returns the component identity and compiled-module size used to charge
+    /// the shared module memory once per resident component.
+    pub async fn component_charge_requirement(
+        &self,
+    ) -> Result<(ComponentId, ComponentRevision, u64), WorkerExecutorError> {
+        let metadata = self.get_latest_worker_metadata().await;
+        let component_id = self.owned_agent_id.component_id();
+        let component_revision = metadata.last_known_status.component_revision;
+        let component_module_bytes = metadata.last_known_status.component_size;
+        Ok((component_id, component_revision, component_module_bytes))
     }
 
     /// Gets the storage requirement of the worker based on the last known status.
@@ -2192,6 +2206,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
     async fn start_waiting_worker(
         this: Arc<Worker<Ctx>>,
         permit: WorkerMemoryPermit,
+        component_charge: WorkerComponentCharge<Ctx>,
         filesystem_storage_permit: Option<FilesystemStoragePermit>,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
         oom_retry_count: u32,
@@ -2207,6 +2222,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                     this.queue.clone(),
                     this.clone(),
                     permit,
+                    component_charge,
                     concurrent_agent_permit,
                     oom_retry_count,
                 )
@@ -2361,6 +2377,27 @@ impl WaitingWorker {
                 // concurrency slot. Otherwise one account could fill the memory
                 // pool with workers that are not allowed to run yet.
                 let permit = parent.active_workers().acquire(memory_requirement).await;
+                // Charge the component's compiled module size once per resident
+                // component (shared by all its workers). Held for as long as this
+                // worker is resident.
+                let component_charge = match parent.component_charge_requirement().await {
+                    Ok((component_id, component_revision, component_module_bytes)) => {
+                        parent
+                            .active_workers()
+                            .acquire_component_charge(
+                                component_id,
+                                component_revision,
+                                component_module_bytes,
+                            )
+                            .await
+                    }
+                    Err(err) => {
+                        warn!(
+                            "Failed to determine component charge requirement, not starting: {err}"
+                        );
+                        return;
+                    }
+                };
                 // Pre-acquire storage permits for this restart.
                 //
                 // We need to acquire `filesystem_storage_requirement + desired_extra` total:
@@ -2412,6 +2449,7 @@ impl WaitingWorker {
                 Worker::start_waiting_worker(
                     parent,
                     permit,
+                    component_charge,
                     filesystem_storage_permit,
                     concurrent_agent_permit,
                     oom_retry_count,
@@ -2444,6 +2482,12 @@ struct RunningWorker {
     sender: UnboundedSender<WorkerCommand>,
     queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
     permit: WorkerMemoryPermit,
+    /// Keeps this worker's component module charge alive for as long as the
+    /// worker is resident. Held only to be dropped: dropping it releases the
+    /// component's residency, and the module charge if this was the last worker
+    /// of the component.
+    #[allow(dead_code)]
+    component_charge: Box<dyn HeldComponentCharge>,
     /// Storage semaphore permits held by this worker. `None` until storage
     /// space is first acquired (at startup or on first write). Dropped
     /// automatically when `RunningWorker` is dropped, returning storage
@@ -2475,6 +2519,7 @@ impl RunningWorker {
         queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
         parent: Arc<Worker<Ctx>>,
         permit: WorkerMemoryPermit,
+        component_charge: WorkerComponentCharge<Ctx>,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
         oom_retry_count: u32,
     ) -> Self {
@@ -2525,6 +2570,7 @@ impl RunningWorker {
             sender,
             queue,
             permit,
+            component_charge: Box::new(component_charge),
             filesystem_storage_permit: None,
             waiting_for_command,
             interrupt_signal,

From 35874d34e7a628a5093dee4e8c4f4843b8305adf Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 00:00:46 -0700
Subject: [PATCH 22/60] fix(worker-executor): disable measured admission when
 executor does not own its memory environment

---
 .../config/debug-worker-executor.sample.env   |  2 +
 .../config/debug-worker-executor.toml         |  2 +
 golem-worker-executor-test-utils/src/lib.rs   | 12 +++++
 .../config/worker-executor.sample.env         |  3 ++
 .../config/worker-executor.toml               |  3 ++
 .../services/active_workers/memory_probe.rs   | 27 ++++++++----
 .../src/services/active_workers/mod.rs        | 44 +++++++++----------
 .../src/services/golem_config.rs              | 13 ++++++
 8 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env
index d717cf777a..7d95b6f7dc 100644
--- a/golem-debugging-service/config/debug-worker-executor.sample.env
+++ b/golem-debugging-service/config/debug-worker-executor.sample.env
@@ -57,6 +57,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
 GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -232,6 +233,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
 GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml
index 7d23b08cd5..316dddd29a 100644
--- a/golem-debugging-service/config/debug-worker-executor.toml
+++ b/golem-debugging-service/config/debug-worker-executor.toml
@@ -98,6 +98,7 @@ max_oplog_query_pages_size = 100
 acquire_retry_delay = "500ms"
 admission_reserve_bytes = 268435456
 component_size_coefficient = 2.0
+enable_measured_admission = true
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
 
@@ -368,6 +369,7 @@ without_time = false
 # acquire_retry_delay = "500ms"
 # admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
+# enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
diff --git a/golem-worker-executor-test-utils/src/lib.rs b/golem-worker-executor-test-utils/src/lib.rs
index ee81b41531..fcfb661670 100644
--- a/golem-worker-executor-test-utils/src/lib.rs
+++ b/golem-worker-executor-test-utils/src/lib.rs
@@ -533,6 +533,15 @@ fn make_base_test_config(deps: &WorkerExecutorTestDependencies) -> GolemConfig {
         // without attempting a gRPC connection to a registry service that does
         // not exist in this test setup.
         resource_limits: ResourceLimitsConfig::Disabled(ResourceLimitsDisabledConfig {}),
+        // The measured-headroom admission gate requires the executor to own its
+        // memory environment (cgroup/process). The in-process test harness runs
+        // the executor alongside the test framework and other services, so the
+        // probe cannot isolate this executor's footprint — disable it and gate on
+        // the estimate semaphore alone, matching pre-gate behaviour.
+        memory: MemoryConfig {
+            enable_measured_admission: false,
+            ..Default::default()
+        },
         ..Default::default()
     }
 }
@@ -696,6 +705,9 @@ pub async fn start_customized(
     apply_sqlite_storage_config(&mut config, deps, context);
     config.memory = MemoryConfig {
         system_memory_override,
+        // Measured admission disabled in the shared in-process test harness; the
+        // small system_memory_override here drives the estimate semaphore alone.
+        enable_measured_admission: false,
         ..Default::default()
     };
     config.filesystem_storage = FilesystemStorageConfig {
diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env
index dc33d7b3c1..bc7bf2c3c0 100644
--- a/golem-worker-executor/config/worker-executor.sample.env
+++ b/golem-worker-executor/config/worker-executor.sample.env
@@ -74,6 +74,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
 GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -295,6 +296,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
 GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -486,6 +488,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
 GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml
index 265ec5f904..819c4fe03d 100644
--- a/golem-worker-executor/config/worker-executor.toml
+++ b/golem-worker-executor/config/worker-executor.toml
@@ -127,6 +127,7 @@ max_oplog_query_pages_size = 100
 acquire_retry_delay = "500ms"
 admission_reserve_bytes = 268435456
 component_size_coefficient = 2.0
+enable_measured_admission = true
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
 
@@ -460,6 +461,7 @@ without_time = false
 # acquire_retry_delay = "500ms"
 # admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
+# enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
@@ -763,6 +765,7 @@ without_time = false
 # acquire_retry_delay = "500ms"
 # admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
+# enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs
index 0d1c4088a3..346a3dd363 100644
--- a/golem-worker-executor/src/services/active_workers/memory_probe.rs
+++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs
@@ -163,18 +163,29 @@ impl MemoryProbe for CgroupV2Probe {
     }
 }
 
-/// Constructs the best available probe for the current platform.
+/// Constructs the best available probe.
 ///
-/// On Linux, prefers cgroup v2; falls back to process RSS. On other targets,
-/// uses process RSS until dedicated backends land. `limit_bytes` is the limit
-/// to charge against and is also the fallback when the cgroup reports an
-/// unlimited `memory.max`.
-pub fn default_probe(limit_bytes: u64) -> Box<dyn MemoryProbe> {
+/// When `memory_override` is set, the limit is self-declared and treated as an
+/// isolated budget measured against this process's RSS — the executor does not
+/// assume it owns a cgroup. When it is `None`, the executor is assumed to own
+/// its memory environment, so on Linux the exact cgroup v2 numbers are used
+/// (falling back to host RAM / process RSS otherwise).
+pub fn default_probe(memory_override: Option<u64>) -> Box<dyn MemoryProbe> {
+    if let Some(limit) = memory_override {
+        return Box::new(ProcessRssProbe::new(limit));
+    }
+
+    let host_ram = {
+        let mut sysinfo = sysinfo::System::new();
+        sysinfo.refresh_memory();
+        sysinfo.total_memory()
+    };
+
     #[cfg(target_os = "linux")]
     {
-        if let Some(probe) = CgroupV2Probe::try_new(limit_bytes) {
+        if let Some(probe) = CgroupV2Probe::try_new(host_ram) {
             return Box::new(probe);
         }
     }
-    Box::new(ProcessRssProbe::new(limit_bytes))
+    Box::new(ProcessRssProbe::new(host_ram))
 }
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 34f8b190ec..7e95da1703 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -81,8 +81,10 @@ pub struct ActiveWorkers<Ctx: WorkerCtx> {
     /// Authoritative measured-headroom admission gate. Decides whether real
     /// memory headroom permits a new acquisition, evicting via the worker set
     /// when short. The estimate-based `worker_memory` semaphore is the cheap
-    /// pre-filter and atomic commit in front of it.
-    admission: AdmissionController,
+    /// pre-filter and atomic commit in front of it. `None` when measured
+    /// admission is disabled (e.g. shared test environments) — admission then
+    /// relies on the estimate semaphore alone.
+    admission: Option<AdmissionController>,
     /// Charges each resident component's compiled module size to the estimate
     /// pool exactly once (shared across all its workers) rather than per worker.
     component_charges:
@@ -137,10 +139,12 @@ impl Drop for WorkerMemoryPermit {
 impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self {
         let worker_memory_size = memory_config.worker_memory();
-        let admission = AdmissionController::new(
-            default_probe(memory_config.total_system_memory()),
-            memory_config.admission_policy(),
-        );
+        let admission = memory_config.enable_measured_admission.then(|| {
+            AdmissionController::new(
+                default_probe(memory_config.system_memory_override),
+                memory_config.admission_policy(),
+            )
+        });
         let workers = Cache::new(
             None,
             FullCacheEvictionMode::None,
@@ -269,14 +273,12 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
             .expect("requested memory size is too large");
 
         loop {
-            // Authoritative measured-headroom gate. Evicts idle-then-warm when
-            // real headroom is short; rejects (and we back off) when it cannot
-            // make room rather than risking the limit.
-            if self
-                .admission
-                .try_admit(memory, &self.eviction_source())
-                .await
-                == AdmissionDecision::Reject
+            // Authoritative measured-headroom gate (when enabled). Evicts
+            // idle-then-warm when real headroom is short; rejects (and we back
+            // off) when it cannot make room rather than risking the limit.
+            if let Some(admission) = &self.admission
+                && admission.try_admit(memory, &self.eviction_source()).await
+                    == AdmissionDecision::Reject
             {
                 debug!("Measured headroom insufficient for {mem32}, backing off and retrying");
                 tokio::time::sleep(self.acquire_retry_delay).await;
@@ -314,14 +316,12 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
             .try_into()
             .expect("requested memory size is too large");
 
-        // Authoritative measured-headroom gate. Single attempt (this is the
-        // non-blocking path): if real headroom is insufficient even after
-        // eviction, do not admit.
-        if self
-            .admission
-            .try_admit(memory, &self.eviction_source())
-            .await
-            == AdmissionDecision::Reject
+        // Authoritative measured-headroom gate (when enabled). Single attempt
+        // (this is the non-blocking path): if real headroom is insufficient even
+        // after eviction, do not admit.
+        if let Some(admission) = &self.admission
+            && admission.try_admit(memory, &self.eviction_source()).await
+                == AdmissionDecision::Reject
         {
             debug!("Measured headroom insufficient for {mem32}, not admitting");
             return None;
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index fdac19ed9c..5a95e0056f 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -970,6 +970,13 @@ pub struct MemoryConfig {
     /// against concurrent admissions overshooting before becoming resident. Used
     /// by the measured-headroom admission gate.
     pub admission_reserve_bytes: u64,
+    /// Whether the measured-headroom admission gate is active. Requires the
+    /// executor to own its memory environment (its own cgroup/process), as in a
+    /// production pod. Disable in shared environments — such as the in-process
+    /// test harness — where the probe cannot isolate this executor's footprint
+    /// from co-resident processes; admission then relies on the estimate
+    /// semaphore alone.
+    pub enable_measured_admission: bool,
     #[serde(with = "humantime_serde")]
     pub acquire_retry_delay: Duration,
     pub oom_retry_config: RetryConfig,
@@ -1032,6 +1039,11 @@ impl SafeDisplay for MemoryConfig {
             "admission reserve bytes: {}",
             self.admission_reserve_bytes
         );
+        let _ = writeln!(
+            &mut result,
+            "measured admission enabled: {}",
+            self.enable_measured_admission
+        );
         let _ = writeln!(
             &mut result,
             "acquire retry delay: {:?}",
@@ -1558,6 +1570,7 @@ impl Default for MemoryConfig {
             worker_estimate_coefficient: 1.1,
             component_size_coefficient: 2.0,
             admission_reserve_bytes: 256 * 1024 * 1024,
+            enable_measured_admission: true,
             acquire_retry_delay: Duration::from_millis(500),
             oom_retry_config: RetryConfig {
                 max_attempts: u32::MAX,

From acb9968ca3008f71d6ee254148fd085933ec9a9a Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 00:01:18 -0700
Subject: [PATCH 23/60] feat(benchmark): add throughput-under-memory-saturation
 benchmarks

---
 .../cloud-density-saturation.yaml             |  64 +++
 integration-tests/src/benchmarks/all.rs       |  24 +
 integration-tests/src/benchmarks/mod.rs       |   1 +
 .../src/benchmarks/throughput_saturation.rs   | 423 ++++++++++++++++++
 test-components/agent-counters/src/lib.rs     |  91 +++-
 5 files changed, 601 insertions(+), 2 deletions(-)
 create mode 100644 integration-tests/benchmark_suites/cloud-density-saturation.yaml
 create mode 100644 integration-tests/src/benchmarks/throughput_saturation.rs

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
new file mode 100644
index 0000000000..59d1409efc
--- /dev/null
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -0,0 +1,64 @@
+# Cloud throughput-saturation benchmark suite.
+#
+# Unlike cloud-perf's throughput benchmarks (which keep `size` small enough that
+# all workers fit in memory), this suite deliberately ramps the number of
+# active, memory-holding agents up to and past the executor's memory ceiling to
+# find the per-pod active-agent capacity and the throughput sustained once
+# memory is exhausted.
+#
+# The counters variant gives each agent a deterministic, per-agent-distinct
+# resident footprint, so the fleet presents a mix of sizes near the limit
+# (exercising admission/eviction). The measured phase has every agent call
+# `busy_for` or `echo` repeatedly with idle gaps; throughput is aggregated.
+#
+# Run with the benchmarks binary's `cloud` subcommand (same flags as cloud-perf):
+#
+#   benchmarks suite integration-tests/benchmark_suites/cloud-density-saturation.yaml \
+#     --save-to-json result.json \
+#     cloud --api-url https://<host> --apps-base-domain <domain> \
+#       --admin-account-token <token> --builtin-plugin-owner-account-id <uuid> \
+#       --default-plan-id <uuid> --component-directory <path-to-wasm-components>
+#
+# Reading the result: plot `saturation-throughput-ops-per-sec` and
+# invocation-retries/timeouts against `size`. Throughput climbs with `size`
+# until the pod's memory is exhausted, then plateaus or drops while retries and
+# eviction churn rise — that knee is the active-agent ceiling.
+#
+# `clusterSize` is ignored in cloud mode (single observed cluster).
+
+name: cloud-density-saturation
+benchmarks:
+  # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
+  # charged once per component, shared across all agents; what scales per agent
+  # is the small instance heap). Knee expected in the thousands, so sweep high.
+  # The current admission algorithm craters around ~700 agents/node; the reworked
+  # admission is expected to push this knee substantially higher.
+  - name: throughput-saturation-echo-rust
+    iterations: 3
+    clusterSize: [2]
+    size: [500, 1000, 2000, 4000, 8000, 12000]
+    length: [0]
+
+  # TypeScript echo agents — each instance instantiates its own QuickJS runtime
+  # and JS heap in its own linear memory (the 17.4 MB module is shared once per
+  # component; the per-instance runtime state is the heavy per-agent cost).
+  # Expect the knee in the hundreds, well below the Rust variant.
+  - name: throughput-saturation-echo-ts
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 250, 500, 750, 1000, 1500, 2000]
+    length: [0]
+
+  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
+  # amount of resident memory, exercising the admission/eviction path with a
+  # controllable footprint near the limit.
+  # size   = number of active, memory-holding agents (the ramp axis)
+  # length = base per-agent memory footprint in bytes; each agent retains a
+  #          deterministic multiple (1x..8x), averaging ~4.5x. 4 MiB base =>
+  #          ~18 MiB average per agent, filling a ~10 GiB usable pool around
+  #          ~580 agents (bracketing the old ~700 crater point).
+  - name: throughput-saturation-counters
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 250, 500, 750, 1000, 1500]
+    length: [4194304]
diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs
index 9b1efd1eb2..e79ac78612 100644
--- a/integration-tests/src/benchmarks/all.rs
+++ b/integration-tests/src/benchmarks/all.rs
@@ -133,6 +133,30 @@ async fn main() {
             >(mode, verbosity, item, primary_only, otlp))
         }),
     );
+    benchmarks_by_name.insert(
+        "throughput-saturation-counters",
+        Box::new(|mode, verbosity, item, primary_only, otlp| {
+            Box::pin(run_benchmark::<
+                integration_tests::benchmarks::throughput_saturation::ThroughputSaturationCounters,
+            >(mode, verbosity, item, primary_only, otlp))
+        }),
+    );
+    benchmarks_by_name.insert(
+        "throughput-saturation-echo-rust",
+        Box::new(|mode, verbosity, item, primary_only, otlp| {
+            Box::pin(run_benchmark::<
+                integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoRust,
+            >(mode, verbosity, item, primary_only, otlp))
+        }),
+    );
+    benchmarks_by_name.insert(
+        "throughput-saturation-echo-ts",
+        Box::new(|mode, verbosity, item, primary_only, otlp| {
+            Box::pin(run_benchmark::<
+                integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoTs,
+            >(mode, verbosity, item, primary_only, otlp))
+        }),
+    );
 
     let params = BenchmarkCliParameters::parse_from(std::env::args_os());
     let tracer_provider = BenchmarkTestDependencies::init_logging(&params);
diff --git a/integration-tests/src/benchmarks/mod.rs b/integration-tests/src/benchmarks/mod.rs
index d1651f063f..0682055643 100644
--- a/integration-tests/src/benchmarks/mod.rs
+++ b/integration-tests/src/benchmarks/mod.rs
@@ -35,6 +35,7 @@ pub mod durability_overhead;
 pub mod latency;
 pub mod sleep;
 pub mod throughput;
+pub mod throughput_saturation;
 
 // Re-export cleanup helpers so callers can use the flat `benchmarks::*` path.
 pub use cleanup::{cleanup_account, cleanup_env_and_app, cleanup_user_state};
diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs
new file mode 100644
index 0000000000..665ff3ec48
--- /dev/null
+++ b/integration-tests/src/benchmarks/throughput_saturation.rs
@@ -0,0 +1,423 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Throughput-under-memory-saturation benchmarks.
+//!
+//! Unlike the regular throughput benchmark — which keeps `size` small enough
+//! that all workers fit comfortably in memory — these benchmarks deliberately
+//! ramp the number of *active* agents up to and past the executor's memory
+//! ceiling, to find the knee: the agent count where the pod can still keep
+//! everything resident (latency flat, throughput scaling linearly) just before
+//! it starts evicting and replaying (latency spikes, throughput craters).
+//!
+//! The measured `run` phase drives sustained load over a fixed window: each
+//! agent repeatedly does a short unit of work then goes idle for [`IDLE_GAP`].
+//! During that gap the agent has no in-flight work and becomes a `LoadedIdle`
+//! eviction candidate, so under memory pressure it can be evicted and then must
+//! reload (oplog replay + re-admission) on its next call — the churn that makes
+//! throughput crater past the knee. Starts are staggered so the fleet is not
+//! synchronised.
+//!
+//! Three variants:
+//! - `throughput-saturation-counters`: agent-counters with a synthetic,
+//!   per-agent-distinct retained footprint (`allocate_memory`) plus CPU work
+//!   (`busy_for`). The footprint is controllable via `length`.
+//! - `throughput-saturation-echo-rust` / `throughput-saturation-echo-ts`: the
+//!   benchmark `echo` agent (Rust / TS) called repeatedly. No synthetic
+//!   footprint — the per-agent memory is the agent's natural footprint, which
+//!   for the TS agent includes the QuickJS runtime. Answers "how many actively
+//!   invoked echo agents fit per pod".
+//!
+//! Parameters:
+//! - `size`   = number of active agents in this step (the ramp axis).
+//! - `length` = for the counters variant, the base per-agent memory footprint in
+//!   bytes (agent `i` retains a deterministic multiple); ignored by the echo
+//!   variants.
+
+use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent};
+use async_trait::async_trait;
+use futures_concurrency::future::Join;
+use golem_common::base_model::agent::{DataValue, ParsedAgentId};
+use golem_common::model::AgentId;
+use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
+use golem_common::{agent_id, data_value};
+use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig};
+use golem_test_framework::config::benchmark::TestMode;
+use golem_test_framework::config::dsl_impl::TestUserContext;
+use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies};
+use golem_test_framework::dsl::{TestDsl, TestDslExtended};
+use indoc::indoc;
+use std::time::{Duration, Instant};
+use tracing::{Instrument, Level, info};
+
+/// Number of distinct footprint buckets the synthetic per-agent memory spread
+/// cycles through, so the fleet holds a mix of sizes rather than a uniform
+/// amount.
+const SPREAD_BUCKETS: usize = 8;
+
+/// CPU busy time (ms) per `busy_for` invocation (counters variant only).
+const BUSY_MILLIS: u32 = 50;
+
+/// Idle gap each agent sleeps between calls. During this gap the agent has no
+/// in-flight work and becomes a `LoadedIdle` eviction candidate. Under memory
+/// pressure it may be evicted and then must reload on its next call — the churn
+/// this benchmark exists to measure.
+const IDLE_GAP: Duration = Duration::from_millis(200);
+
+/// Total measured wall-clock duration of the sustained-load phase. Throughput
+/// and churn are measured over this fixed window so steps with different `size`
+/// are comparable.
+const RUN_DURATION: Duration = Duration::from_secs(30);
+
+/// Maximum per-agent start stagger, so the fleet is not synchronised: at any
+/// instant some agents are mid-call (demanding memory) while others sit idle
+/// (evictable).
+const MAX_STAGGER: Duration = Duration::from_millis(250);
+
+/// Resident memory (bytes) the synthetic-footprint agent `index` retains for a
+/// given `base`. Spreads deterministically across [`SPREAD_BUCKETS`] buckets
+/// (`base * 1` .. `base * SPREAD_BUCKETS`) so different agents hold different
+/// amounts and some sit much closer to the limit than others.
+fn agent_memory_bytes(index: usize, base: usize) -> u32 {
+    let bucket = (index % SPREAD_BUCKETS) + 1;
+    (base.saturating_mul(bucket)).min(u32::MAX as usize) as u32
+}
+
+/// Per-agent start offset derived deterministically from the index, spread
+/// across `[0, MAX_STAGGER)`.
+fn agent_stagger(index: usize) -> Duration {
+    let frac = (index as u32).wrapping_mul(2_654_435_761) % 1000;
+    MAX_STAGGER.checked_mul(frac).unwrap_or_default() / 1000
+}
+
+/// Describes one saturation variant: which component to load, which agent type
+/// and method to actively invoke, and whether to pre-load a synthetic footprint.
+struct SaturationVariant {
+    /// WASM file name (without `.wasm`) in the component directory.
+    wasm_name: &'static str,
+    /// Registry display name for the component.
+    component_name: &'static str,
+    /// Agent type to instantiate.
+    agent_type: &'static str,
+    /// Method invoked repeatedly during the measured phase.
+    active_method: &'static str,
+    /// Builds the parameter for one `active_method` call.
+    active_params: fn() -> DataValue,
+    /// When set, each agent calls this method once in warmup with its
+    /// deterministic footprint (`allocate_memory`-style). `None` for the echo
+    /// variants, whose footprint is the agent's natural memory.
+    allocate_method: Option<&'static str>,
+}
+
+const COUNTERS_VARIANT: SaturationVariant = SaturationVariant {
+    wasm_name: "it_agent_counters_release",
+    component_name: "it:agent-counters",
+    agent_type: "Counter",
+    active_method: "busy-for",
+    active_params: || data_value!(BUSY_MILLIS),
+    allocate_method: Some("allocate-memory"),
+};
+
+const ECHO_RUST_VARIANT: SaturationVariant = SaturationVariant {
+    wasm_name: "benchmark_agent_rust_release",
+    component_name: "benchmark:agent-rust",
+    agent_type: "RustBenchmarkAgent",
+    active_method: "echo",
+    active_params: || data_value!("saturation"),
+    allocate_method: None,
+};
+
+const ECHO_TS_VARIANT: SaturationVariant = SaturationVariant {
+    wasm_name: "benchmark_agent_ts",
+    component_name: "benchmark:agent-ts",
+    agent_type: "BenchmarkAgent",
+    active_method: "echo",
+    active_params: || data_value!("saturation"),
+    allocate_method: None,
+};
+
+pub struct SaturationBenchmarkContext {
+    deps: BenchmarkTestDependencies,
+}
+
+pub struct SaturationIterationContext {
+    user: TestUserContext<BenchmarkTestDependencies>,
+    component: ComponentDto,
+    agent_ids: Vec<ParsedAgentId>,
+    base_memory_bytes: usize,
+    env_id: EnvironmentId,
+}
+
+/// Shared implementation for all saturation variants. The variant-specific
+/// config is passed in by the wrappers that `saturation_benchmark!` generates.
+async fn create_context(
+    mode: &TestMode,
+    verbosity: Level,
+    cluster_size: usize,
+    disable_compilation_cache: bool,
+    otlp: bool,
+) -> SaturationBenchmarkContext {
+    SaturationBenchmarkContext {
+        deps: BenchmarkTestDependencies::new(
+            mode,
+            verbosity,
+            cluster_size,
+            disable_compilation_cache,
+            otlp,
+        )
+        .await,
+    }
+}
+
+async fn setup_iteration(
+    variant: &SaturationVariant,
+    config: &RunConfig,
+    benchmark_context: &SaturationBenchmarkContext,
+) -> SaturationIterationContext {
+    let user = benchmark_context.deps.user().await.unwrap();
+    let (_, env) = user.app_and_env().await.unwrap();
+
+    info!("Registering component {}", variant.component_name);
+    let component = user
+        .component(&env.id, variant.wasm_name)
+        .name(variant.component_name)
+        .store()
+        .await
+        .unwrap();
+
+    let mut agent_ids = vec![];
+    for n in 0..config.size {
+        agent_ids.push(agent_id!(variant.agent_type, format!("saturation-{n}")));
+    }
+
+    SaturationIterationContext {
+        user,
+        component,
+        agent_ids,
+        base_memory_bytes: config.length,
+        env_id: env.id,
+    }
+}
+
+async fn warmup(variant: &SaturationVariant, context: &SaturationIterationContext) {
+    let Some(allocate_method) = variant.allocate_method else {
+        // Echo variants: nothing to pre-load; the agent's natural footprint is
+        // established on first invocation.
+        return;
+    };
+
+    async {
+        let base = context.base_memory_bytes;
+        let result_futures = context
+            .agent_ids
+            .iter()
+            .enumerate()
+            .map(move |(idx, agent_id)| async move {
+                let user_clone = context.user.clone();
+                let bytes = agent_memory_bytes(idx, base);
+                invoke_and_await_agent(
+                    &user_clone,
+                    &context.component,
+                    agent_id,
+                    allocate_method,
+                    data_value!(bytes),
+                )
+                .await
+            })
+            .collect::<Vec<_>>();
+        let _ = result_futures.join().await;
+    }
+    .instrument(tracing::info_span!(
+        "warmup_allocate_memory",
+        agent_count = context.agent_ids.len()
+    ))
+    .await;
+}
+
+async fn run(
+    variant: &SaturationVariant,
+    context: &SaturationIterationContext,
+    recorder: BenchmarkRecorder,
+) {
+    let agent_count = context.agent_ids.len();
+    let deadline = Instant::now() + RUN_DURATION;
+
+    let result_futures = context
+        .agent_ids
+        .iter()
+        .enumerate()
+        .map(|(idx, agent_id)| {
+            let recorder = recorder.clone();
+            async move {
+                let user_clone = context.user.clone();
+
+                tokio::time::sleep(agent_stagger(idx)).await;
+
+                let mut calls = 0u64;
+                while Instant::now() < deadline {
+                    let result = invoke_and_await_agent(
+                        &user_clone,
+                        &context.component,
+                        agent_id,
+                        variant.active_method,
+                        (variant.active_params)(),
+                    )
+                    .await;
+                    result.record(&recorder, "", idx.to_string().as_str());
+                    calls += 1;
+                    tokio::time::sleep(IDLE_GAP).await;
+                }
+                calls
+            }
+        })
+        .collect::<Vec<_>>();
+
+    let started = Instant::now();
+    let per_agent_calls = result_futures.join().await;
+    let elapsed = started.elapsed();
+
+    // Aggregate sustained throughput over the fixed run window. Across `size`
+    // steps, this reveals where added active agents stop adding throughput
+    // (memory saturation / eviction churn dominates) — the knee we are after.
+    let total_calls: u64 = per_agent_calls.iter().sum();
+    let secs = elapsed.as_secs_f64();
+    if secs > 0.0 {
+        let ops_per_sec = (total_calls as f64 / secs).round() as u64;
+        info!(
+            "saturation: {agent_count} agents, {total_calls} calls in {secs:.1}s = {ops_per_sec} ops/sec"
+        );
+        recorder.count(
+            &ResultKey::primary("saturation-throughput-ops-per-sec"),
+            ops_per_sec,
+        );
+    }
+}
+
+async fn cleanup_iteration(context: SaturationIterationContext) {
+    let agent_ids: Vec<AgentId> = context
+        .agent_ids
+        .iter()
+        .filter_map(|agent_id| AgentId::from_agent_id(context.component.id, agent_id).ok())
+        .collect();
+    delete_workers(&context.user, &agent_ids).await;
+    cleanup_user_state(&context.user, &context.env_id).await;
+}
+
+/// Generates a `Benchmark` impl wrapper for a saturation variant.
+macro_rules! saturation_benchmark {
+    ($ty:ident, $bench_name:literal, $variant:expr, $description:literal) => {
+        pub struct $ty {
+            config: RunConfig,
+        }
+
+        #[async_trait]
+        impl Benchmark for $ty {
+            type BenchmarkContext = SaturationBenchmarkContext;
+            type IterationContext = SaturationIterationContext;
+
+            fn name() -> &'static str {
+                $bench_name
+            }
+
+            fn description() -> &'static str {
+                indoc! { $description }
+            }
+
+            async fn create_benchmark_context(
+                mode: &TestMode,
+                verbosity: Level,
+                cluster_size: usize,
+                disable_compilation_cache: bool,
+                otlp: bool,
+            ) -> Self::BenchmarkContext {
+                create_context(
+                    mode,
+                    verbosity,
+                    cluster_size,
+                    disable_compilation_cache,
+                    otlp,
+                )
+                .await
+            }
+
+            async fn cleanup(benchmark_context: Self::BenchmarkContext) {
+                benchmark_context.deps.kill_all().await;
+            }
+
+            async fn create(_mode: &TestMode, config: RunConfig) -> Self {
+                Self { config }
+            }
+
+            async fn setup_iteration(
+                &self,
+                benchmark_context: &Self::BenchmarkContext,
+            ) -> Self::IterationContext {
+                setup_iteration(&$variant, &self.config, benchmark_context).await
+            }
+
+            async fn warmup(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: &Self::IterationContext,
+            ) {
+                warmup(&$variant, context).await
+            }
+
+            async fn run(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: &Self::IterationContext,
+                recorder: BenchmarkRecorder,
+            ) {
+                run(&$variant, context, recorder).await
+            }
+
+            async fn cleanup_iteration(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: Self::IterationContext,
+            ) {
+                cleanup_iteration(context).await
+            }
+        }
+    };
+}
+
+saturation_benchmark!(
+    ThroughputSaturationCounters,
+    "throughput-saturation-counters",
+    COUNTERS_VARIANT,
+    "Ramps `size` active agents that each retain a deterministic, per-agent-distinct
+    synthetic memory footprint (controlled by `length`) and do CPU work, measuring
+    sustained throughput to locate the memory-saturation knee."
+);
+
+saturation_benchmark!(
+    ThroughputSaturationEchoRust,
+    "throughput-saturation-echo-rust",
+    ECHO_RUST_VARIANT,
+    "Ramps `size` actively-invoked Rust `echo` agents to find how many fit resident
+    per pod before eviction churn craters throughput. The per-agent footprint is the
+    agent's natural memory (no synthetic allocation)."
+);
+
+saturation_benchmark!(
+    ThroughputSaturationEchoTs,
+    "throughput-saturation-echo-ts",
+    ECHO_TS_VARIANT,
+    "Ramps `size` actively-invoked TypeScript `echo` agents to find how many fit
+    resident per pod before eviction churn craters throughput. The per-agent
+    footprint is the agent's natural memory, including the QuickJS runtime."
+);
diff --git a/test-components/agent-counters/src/lib.rs b/test-components/agent-counters/src/lib.rs
index b2ac7d4d44..b14840512d 100644
--- a/test-components/agent-counters/src/lib.rs
+++ b/test-components/agent-counters/src/lib.rs
@@ -3,6 +3,43 @@ pub mod repository;
 
 use golem_rust::{agent_definition, agent_implementation, generate_idempotency_key};
 
+/// Page size used when touching retained memory so the OS backs it with real
+/// resident pages rather than leaving it as untouched (non-resident) reservation.
+const PAGE_SIZE: usize = 4096;
+
+/// Spins doing cheap arithmetic for approximately `millis` milliseconds, polling
+/// the monotonic clock between batches of work rather than on every iteration so
+/// the workload is CPU-bound, not clock-syscall-bound. Returns an accumulated
+/// value so the work cannot be optimised away.
+fn busy_loop(millis: u32) -> u32 {
+    let deadline = std::time::Duration::from_millis(millis as u64);
+    let start = std::time::Instant::now();
+    let mut acc: u32 = 0;
+    loop {
+        for i in 0..10_000u32 {
+            acc = acc.wrapping_add(i).wrapping_mul(31).wrapping_add(7);
+        }
+        if start.elapsed() >= deadline {
+            break;
+        }
+    }
+    acc
+}
+
+/// Resets `buffer` to exactly `bytes` zeroed bytes and touches one byte per page
+/// so the memory becomes resident (real RSS), not just reserved address space.
+fn retain_memory(buffer: &mut Vec<u8>, bytes: u32) {
+    let bytes = bytes as usize;
+    buffer.clear();
+    buffer.shrink_to_fit();
+    buffer.resize(bytes, 0);
+    let mut page = 0;
+    while page < bytes {
+        buffer[page] = buffer[page].wrapping_add(1);
+        page += PAGE_SIZE;
+    }
+}
+
 #[agent_definition]
 trait Counter {
     fn new(id: String) -> Self;
@@ -10,17 +47,32 @@ trait Counter {
     async fn increment_through_rpc(&mut self) -> u32;
     async fn increment_through_rpc_to_ephemeral(&mut self) -> u32;
     async fn increment_through_rpc_to_ephemeral_phantom(&mut self) -> u32;
+
+    /// Spins for `millis` milliseconds of cheap CPU work, then increments and
+    /// returns the counter. Used to define an "active" agent without making the
+    /// workload oplog-bound on a tight loop.
+    fn busy_for(&mut self, millis: u32) -> u32;
+
+    /// Retains `bytes` of resident linear memory in the agent's state and
+    /// increments the counter. The memory stays resident across invocations so
+    /// the agent contributes a controllable footprint to the executor's pool.
+    fn allocate_memory(&mut self, bytes: u32) -> u32;
 }
 
 struct CounterImpl {
     count: u32,
     id: String,
+    retained: Vec<u8>,
 }
 
 #[agent_implementation]
 impl Counter for CounterImpl {
     fn new(id: String) -> Self {
-        Self { id, count: 0 }
+        Self {
+            id,
+            count: 0,
+            retained: Vec::new(),
+        }
     }
 
     fn increment(&mut self) -> u32 {
@@ -42,29 +94,64 @@ impl Counter for CounterImpl {
         let mut client = EphemeralSingletonCounterClient::new_phantom();
         client.increment().await
     }
+
+    fn busy_for(&mut self, millis: u32) -> u32 {
+        let _ = busy_loop(millis);
+        self.count += 1;
+        self.count
+    }
+
+    fn allocate_memory(&mut self, bytes: u32) -> u32 {
+        retain_memory(&mut self.retained, bytes);
+        self.count += 1;
+        self.count
+    }
 }
 
 #[agent_definition(ephemeral)]
 trait EphemeralCounter {
     fn new(id: String) -> Self;
     fn increment(&mut self) -> u32;
+
+    /// See [`Counter::busy_for`].
+    fn busy_for(&mut self, millis: u32) -> u32;
+
+    /// See [`Counter::allocate_memory`].
+    fn allocate_memory(&mut self, bytes: u32) -> u32;
 }
 
 struct EphemeralCounterImpl {
     count: u32,
     _id: String,
+    retained: Vec<u8>,
 }
 
 #[agent_implementation]
 impl EphemeralCounter for EphemeralCounterImpl {
     fn new(id: String) -> Self {
-        Self { _id: id, count: 0 }
+        Self {
+            _id: id,
+            count: 0,
+            retained: Vec::new(),
+        }
     }
 
     fn increment(&mut self) -> u32 {
         self.count += 1;
         self.count
     }
+
+    fn busy_for(&mut self, millis: u32) -> u32 {
+        let _ = busy_loop(millis);
+        self.count += 1;
+        self.count
+    }
+
+    fn allocate_memory(&mut self, bytes: u32) -> u32 {
+        retain_memory(&mut self.retained, bytes);
+        self.count += 1;
+        self.count
+    }
 }
 
 

From bfe1b145b5561aa6613b629f58e8969bbacdfd0f Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 00:38:46 -0700
Subject: [PATCH 24/60] test(worker-executor): exercise admission reserve under
 maximum concurrent overlap

---
 .../active_workers/admission/tests.rs         | 172 +++++++++++++-----
 1 file changed, 122 insertions(+), 50 deletions(-)

diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs
index bd9b51aabb..50d545ddb0 100644
--- a/golem-worker-executor/src/services/active_workers/admission/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs
@@ -435,72 +435,144 @@ async fn usable_ratio_caps_admission_below_full_limit() {
     );
 }
 
-// ── Concurrency: the simultaneous-big-start race ─────────────────────────────
+// ── Concurrency ──────────────────────────────────────────────────────────────
+//
+// In production, each admission reads headroom (`try_admit`) and then separately
+// commits to the upstream atomic permit (modeled here as `pinned_usage +=
+// request`). The two steps are not serialised across concurrent admissions, so
+// several admissions can read the same pre-commit snapshot, all pass the check,
+// and all commit. The `reserve` margin accounts for this instead of a lock:
+// concurrent admissions may push usage above the carve-out ceiling into the
+// reserve, but must not push it above the true `limit`.
+//
+// These tests force the maximum-overlap case with a barrier: every admission
+// completes its headroom check before any admission commits. This makes the
+// maximum overshoot deterministic rather than dependent on task scheduling, so
+// an undersized reserve is reliably detected and a correctly sized one is
+// actually exercised.
+
+/// Run `racers` admissions of `request` bytes against a fresh environment with
+/// the given `reserve`, forcing all headroom checks to complete before any
+/// commit (maximum overlap). Returns the final environment usage and the number
+/// of admits granted.
+async fn race_admissions_worst_case(
+    limit: u64,
+    initial_pinned: u64,
+    reserve: u64,
+    racers: usize,
+    request: u64,
+) -> (u64, usize) {
+    let state = Arc::new(Mutex::new(EnvState {
+        limit,
+        pinned_usage: initial_pinned,
+        residents: vec![],
+        ..Default::default()
+    }));
+    let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve));
+    // All racers check before any commits: the maximum-overlap schedule.
+    let barrier = Arc::new(tokio::sync::Barrier::new(racers));
+
+    let mut handles = Vec::new();
+    for _ in 0..racers {
+        let ctrl = ctrl.clone();
+        let state = state.clone();
+        let barrier = barrier.clone();
+        handles.push(tokio::spawn(async move {
+            let source = FakeEvictionSource {
+                state: state.clone(),
+            };
+            let decision = ctrl.try_admit(request, &source).await;
+            // Hold every racer here until all have decided against the same
+            // pre-commit snapshot, then let the commits run together.
+            barrier.wait().await;
+            if decision == AdmissionDecision::Admit {
+                state.lock().unwrap().pinned_usage += request;
+                true
+            } else {
+                false
+            }
+        }));
+    }
+    let mut admitted = 0;
+    for h in handles {
+        if h.await.unwrap() {
+            admitted += 1;
+        }
+    }
+    let usage = state.lock().unwrap().usage();
+    (usage, admitted)
+}
 
 proptest! {
-    /// The contract for the safety invariant under concurrency.
-    ///
-    /// Many admissions race at once with no external serialisation across the
-    /// headroom check and the commit (the commit models the upstream atomic
-    /// permit grant; the check is a separate prior read, so a genuine
-    /// time-of-check/time-of-use window exists between concurrent tasks).
+    /// A reserve sized for the maximum concurrent overshoot keeps real usage
+    /// under the limit even when every racer checks before any commits, with a
+    /// non-trivial near-ceiling pinned base.
     ///
-    /// The invariant: real usage must never exceed the true `limit`. Admissions
-    /// may collectively overshoot the carve-out ceiling into the reserve — that
-    /// is what the reserve is for — but never past `limit` itself. The reserve
-    /// is sized here to cover the worst-case concurrent overshoot (number of
-    /// racers × max request), so a passing test means the reserve margin is a
-    /// sufficient substitute for serialising the gate. If this ever fails, the
-    /// margin is insufficient for the chosen concurrency and the gate's
-    /// correctness depends on stronger synchronisation.
+    /// Sizing: at most all `racers` can pass against the same pre-commit
+    /// snapshot, so the reserve must cover `racers × request` landing in the
+    /// window between check and commit. With that margin, usage stays
+    /// `<= limit`.
     #[test]
-    fn concurrent_admissions_never_exceed_limit(
+    fn sufficient_reserve_holds_under_worst_case_overlap(
         racers in 2usize..16,
         request in 50u64..400,
+        base_fill in 0u64..2000,
     ) {
-        // Worst case: every racer passes the check against the same snapshot and
-        // commits. The reserve must cover (racers - 1) extra in-flight requests
-        // beyond the one the headroom was actually sized for.
         let reserve = request * racers as u64;
-        // Ceiling must leave room for at least one request above the reserve.
-        let limit = reserve + request + 1000;
+        // Limit leaves room for the pre-existing fill, the reserve, and at least
+        // one request's worth of admissible headroom above the reserve.
+        let limit = base_fill + reserve + request + 500;
 
         let rt = tokio::runtime::Builder::new_multi_thread()
             .worker_threads(4)
             .build()
             .unwrap();
         rt.block_on(async move {
-            let state = Arc::new(Mutex::new(EnvState {
-                limit,
-                pinned_usage: 0,
-                residents: vec![],
-                ..Default::default()
-            }));
-            let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve));
-
-            let mut handles = Vec::new();
-            for _ in 0..racers {
-                let ctrl = ctrl.clone();
-                let state = state.clone();
-                handles.push(tokio::spawn(async move {
-                    let source = FakeEvictionSource { state: state.clone() };
-                    let decision = ctrl.try_admit(request, &source).await;
-                    if decision == AdmissionDecision::Admit {
-                        // Models the atomic permit grant: a single locked
-                        // fetch-add, separate from the (already-completed) check.
-                        state.lock().unwrap().pinned_usage += request;
-                    }
-                }));
-            }
-            for h in handles {
-                h.await.unwrap();
-            }
+            let (usage, _) =
+                race_admissions_worst_case(limit, base_fill, reserve, racers, request).await;
+            prop_assert!(
+                usage <= limit,
+                "maximum overlap drove usage {usage} past limit {limit}"
+            );
+            Ok(())
+        }).unwrap();
+    }
 
-            let s = state.lock().unwrap();
+    /// With no reserve and maximum overlap forced, several racers admitting at
+    /// once must push usage above the carve-out ceiling. This confirms the race
+    /// the design tolerates is real and this harness reproduces it; without it,
+    /// the safety test above could pass without ever exercising a concurrent
+    /// overshoot. Note `ceiling + pinned == limit` here, so the asserted
+    /// overshoot (`usage > ceiling + pinned`) is also an overshoot of `limit`.
+    #[test]
+    fn worst_case_overlap_overshoots_ceiling_without_reserve(
+        racers in 2usize..12,
+        request in 50u64..400,
+    ) {
+        // Ceiling headroom sized for exactly one request; no reserve cushion.
+        let ceiling = request;
+        let limit = request * racers as u64 + 1000;
+
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(4)
+            .build()
+            .unwrap();
+        rt.block_on(async move {
+            // pinned = limit - ceiling so admissible headroom is exactly one
+            // request; with reserve 0, every racer sees room for itself.
+            let pinned = limit - ceiling;
+            let (usage, admitted) =
+                race_admissions_worst_case(limit, pinned, 0, racers, request).await;
+            // More than one admit means the gate let concurrent racers through
+            // on the same snapshot.
+            prop_assert!(
+                admitted >= 2,
+                "expected concurrent over-admission with no reserve, got {admitted} admits"
+            );
             prop_assert!(
-                s.usage() <= s.limit,
-                "concurrent admissions drove usage {} past limit {}",
-                s.usage(), s.limit
+                usage > ceiling + pinned,
+                "usage {usage} did not overshoot the ceiling {}",
+                ceiling + pinned
             );
             Ok(())
         }).unwrap();

From c3af739848f079bb71bae2bdce482301894358a3 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 09:08:18 -0700
Subject: [PATCH 25/60] feat(benchmark): longer sustained load, bump the
 number of agents

---
 .../cloud-density-saturation.yaml             | 48 ++++++++++---------
 .../src/benchmarks/throughput_saturation.rs   |  6 ++-
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index 59d1409efc..c877993cc6 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -28,37 +28,41 @@
 
 name: cloud-density-saturation
 benchmarks:
+  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
+  # amount of resident memory, exercising the admission/eviction path with a
+  # controllable footprint near the limit. Run first: this is the variant that
+  # actually fills memory and drives the gate to its reject/evict path.
+  # size   = number of active, memory-holding agents (the ramp axis)
+  # length = base per-agent memory footprint in bytes; each agent retains a
+  #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
+  #          ~72 MiB average per agent, filling a ~10 GiB usable pool around
+  #          ~145 agents. The sweep brackets that ceiling and pushes well past it
+  #          so the admission gate's reject/evict behaviour near OOM is exercised.
+  - name: throughput-saturation-counters
+    iterations: 1
+    clusterSize: [2]
+    size: [50, 100, 150, 200, 300, 500]
+    length: [16777216]
+
   # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
   # charged once per component, shared across all agents; what scales per agent
-  # is the small instance heap). Knee expected in the thousands, so sweep high.
-  # The current admission algorithm craters around ~700 agents/node; the reworked
-  # admission is expected to push this knee substantially higher.
+  # is the small instance heap). The previous run reached the top of the sweep
+  # (12000) without saturating pod memory, so the knee here is throughput /
+  # eviction-churn rather than memory. Dropped the low points that told us
+  # nothing and pushed the range up with coarser steps.
   - name: throughput-saturation-echo-rust
-    iterations: 3
+    iterations: 1
     clusterSize: [2]
-    size: [500, 1000, 2000, 4000, 8000, 12000]
+    size: [4000, 8000, 16000, 24000, 32000]
     length: [0]
 
   # TypeScript echo agents — each instance instantiates its own QuickJS runtime
   # and JS heap in its own linear memory (the 17.4 MB module is shared once per
   # component; the per-instance runtime state is the heavy per-agent cost).
-  # Expect the knee in the hundreds, well below the Rust variant.
+  # Heavier per agent than the Rust variant, so a lower knee — but the previous
+  # run reached 2000 without saturating, so push higher and drop the low points.
   - name: throughput-saturation-echo-ts
-    iterations: 3
+    iterations: 1
     clusterSize: [2]
-    size: [100, 250, 500, 750, 1000, 1500, 2000]
+    size: [1000, 2000, 4000, 6000, 8000]
     length: [0]
-
-  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
-  # amount of resident memory, exercising the admission/eviction path with a
-  # controllable footprint near the limit.
-  # size   = number of active, memory-holding agents (the ramp axis)
-  # length = base per-agent memory footprint in bytes; each agent retains a
-  #          deterministic multiple (1x..8x), averaging ~4.5x. 4 MiB base =>
-  #          ~18 MiB average per agent, filling a ~10 GiB usable pool around
-  #          ~580 agents (bracketing the old ~700 crater point).
-  - name: throughput-saturation-counters
-    iterations: 3
-    clusterSize: [2]
-    size: [100, 250, 500, 750, 1000, 1500]
-    length: [4194304]
diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs
index 665ff3ec48..44568614a9 100644
--- a/integration-tests/src/benchmarks/throughput_saturation.rs
+++ b/integration-tests/src/benchmarks/throughput_saturation.rs
@@ -78,8 +78,10 @@ const IDLE_GAP: Duration = Duration::from_millis(200);
 
 /// Total measured wall-clock duration of the sustained-load phase. Throughput
 /// and churn are measured over this fixed window so steps with different `size`
-/// are comparable.
-const RUN_DURATION: Duration = Duration::from_secs(30);
+/// are comparable. Held long enough that the high-residency plateau persists for
+/// at least a minute, so steady-state behaviour at the memory ceiling (not just
+/// the initial burst) is observed.
+const RUN_DURATION: Duration = Duration::from_secs(90);
 
 /// Maximum per-agent start stagger, so the fleet is not synchronised: at any
 /// instant some agents are mid-call (demanding memory) while others sit idle

From 7dcb2d3d4d39030b72a074945bd1cbcb2bafa000 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 09:36:17 -0700
Subject: [PATCH 26/60] fix: add empty workspace

---
 test-components/agent-counters/Cargo.toml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test-components/agent-counters/Cargo.toml b/test-components/agent-counters/Cargo.toml
index c7567da5a5..069f9180f3 100644
--- a/test-components/agent-counters/Cargo.toml
+++ b/test-components/agent-counters/Cargo.toml
@@ -3,6 +3,12 @@ name = "it_agent_counters"
 version = "0.0.1"
 edition = "2024"
 
+# Standalone workspace root: this component is excluded from the golem-oss
+# workspace, and when built nested inside another repo's workspace (e.g. the
+# cloud-perf CI checkout under golem-cloud) cargo would otherwise walk up and
+# attach it to that unrelated workspace. An empty table stops that search.
+[workspace]
+
 [profile.release]
 opt-level = "s"
 lto = true

From 139aed5535a36355009627d6a36d125708cf16cb Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 09:59:07 -0700
Subject: [PATCH 27/60] fix: use snake case for method names

---
 integration-tests/src/benchmarks/throughput_saturation.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs
index 44568614a9..768d8c7eb1 100644
--- a/integration-tests/src/benchmarks/throughput_saturation.rs
+++ b/integration-tests/src/benchmarks/throughput_saturation.rs
@@ -127,9 +127,9 @@ const COUNTERS_VARIANT: SaturationVariant = SaturationVariant {
     wasm_name: "it_agent_counters_release",
     component_name: "it:agent-counters",
     agent_type: "Counter",
-    active_method: "busy-for",
+    active_method: "busy_for",
     active_params: || data_value!(BUSY_MILLIS),
-    allocate_method: Some("allocate-memory"),
+    allocate_method: Some("allocate_memory"),
 };
 
 const ECHO_RUST_VARIANT: SaturationVariant = SaturationVariant {

From 442c1c5b0442a00a9d88197db927ab08462c3cfe Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 12:18:22 -0700
Subject: [PATCH 28/60] chore: 300 already saturates, no need for 500

---
 .../benchmark_suites/cloud-density-saturation.yaml              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index c877993cc6..c6cc81c813 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -41,7 +41,7 @@ benchmarks:
   - name: throughput-saturation-counters
     iterations: 1
     clusterSize: [2]
-    size: [50, 100, 150, 200, 300, 500]
+    size: [50, 100, 150, 200, 300]
     length: [16777216]
 
   # Rust echo agents — lean per-instance linear memory (the ~900 KB module is

From 4bbb200ebb3bc8a5cb2ad9e680b23fca4dd4c50d Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 14:04:47 -0700
Subject: [PATCH 29/60] fix(worker-executor): avoid deadlock between memory
 grow and admission eviction scan

---
 .../active_workers/admission/tests.rs         | 163 ++++++++++++++++++
 golem-worker-executor/src/worker/mod.rs       |  38 ++--
 2 files changed, 191 insertions(+), 10 deletions(-)

diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs
index 50d545ddb0..8eca9e157c 100644
--- a/golem-worker-executor/src/services/active_workers/admission/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs
@@ -578,3 +578,166 @@ proptest! {
         }).unwrap();
     }
 }
+
+/// Concurrent memory grows must not deadlock against the admission eviction
+/// scan.
+///
+/// A memory grow acquires a permit while the growing worker holds its own
+/// instance lock, and the admission slow path scans the worker set, taking each
+/// other worker's instance lock to classify it for eviction. With many workers
+/// growing at once under memory pressure these two must not form an AB-BA cycle.
+/// Workloads that never grow memory never exercise this path.
+mod grow_lock_ordering {
+    use super::super::{AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource};
+    use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot};
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_r::test;
+    use tokio::sync::Mutex as AsyncMutex;
+
+    /// Per-worker lock, standing in for `Worker::instance`.
+    type WorkerLock = Arc<AsyncMutex<()>>;
+
+    /// Probe pinned to zero admissible headroom so `try_admit` takes the slow
+    /// (scanning) path, modelling the moment a grow's requested delta does not
+    /// fit the current headroom.
+    #[derive(Debug)]
+    struct SaturatedProbe;
+
+    impl MemoryProbe for SaturatedProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: 1,
+                current_bytes: u64::MAX,
+            }
+        }
+    }
+
+    /// Probe reporting ample headroom so `try_admit` takes the fast path and
+    /// never scans — the same grow code path, but not under memory pressure.
+    #[derive(Debug)]
+    struct AmpleHeadroomProbe;
+
+    impl MemoryProbe for AmpleHeadroomProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: u64::MAX,
+                current_bytes: 0,
+            }
+        }
+    }
+
+    /// Eviction source that, like `evict_at_most_memory`, scans every worker and
+    /// takes each worker's instance lock (via `eviction_class`) to classify it.
+    /// Frees nothing (all workers active). The lock on each worker is held only
+    /// briefly, faithfully — the deadlock comes from the ordering, not hold time.
+    struct ScanningEvictionSource {
+        workers: Vec<WorkerLock>,
+    }
+
+    #[async_trait::async_trait]
+    impl EvictionSource for ScanningEvictionSource {
+        async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 {
+            for worker in &self.workers {
+                let _guard = worker.lock().await;
+            }
+            0
+        }
+    }
+
+    /// Models the grow path's lock interaction: run the admission scan, which
+    /// takes other workers' instance locks, without holding this worker's own
+    /// instance lock, then take it afterwards to merge the permit (as
+    /// `Worker::increase_memory` does).
+    async fn grow_then_lock(
+        controller: &AdmissionController,
+        own: &WorkerLock,
+        workers: Vec<WorkerLock>,
+    ) {
+        let source = ScanningEvictionSource { workers };
+        controller.try_admit(1, &source).await;
+        let _own_guard = own.lock().await;
+    }
+
+    fn workers(n: usize) -> Vec<WorkerLock> {
+        (0..n).map(|_| Arc::new(AsyncMutex::new(()))).collect()
+    }
+
+    fn controller(probe: Box<dyn MemoryProbe>) -> Arc<AdmissionController> {
+        Arc::new(AdmissionController::new(
+            probe,
+            AdmissionPolicy {
+                usable_ratio: 1.0,
+                reserve_bytes: 0,
+            },
+        ))
+    }
+
+    /// Many workers growing concurrently under memory pressure (every grow takes
+    /// the scanning slow path) must all complete without deadlocking.
+    #[test(flavor = "multi_thread", worker_threads = 4)]
+    async fn concurrent_grows_do_not_deadlock_under_pressure() {
+        const WORKERS: usize = 32;
+        const DEADLINE: Duration = Duration::from_secs(10);
+
+        let workers = workers(WORKERS);
+        let controller = controller(Box::new(SaturatedProbe));
+
+        let mut grows = Vec::new();
+        for i in 0..WORKERS {
+            let controller = controller.clone();
+            let all = workers.clone();
+            let own = workers[i].clone();
+            grows.push(tokio::spawn(async move {
+                grow_then_lock(&controller, &own, all).await;
+            }));
+        }
+
+        let all_done = async {
+            for task in grows {
+                let _ = task.await;
+            }
+        };
+
+        let result = tokio::time::timeout(DEADLINE, all_done).await;
+        assert!(
+            result.is_ok(),
+            "concurrent grows deadlocked: the scan must not run while a worker holds its own instance lock"
+        );
+    }
+
+    /// With comfortable headroom the gate admits on the fast path without
+    /// scanning, so no worker's instance lock is taken during admission and
+    /// concurrent grows complete. Confirms the deadlock risk is specific to the
+    /// scan-under-pressure path.
+    #[test(flavor = "multi_thread", worker_threads = 4)]
+    async fn no_deadlock_with_ample_headroom() {
+        const WORKERS: usize = 32;
+        const DEADLINE: Duration = Duration::from_secs(10);
+
+        let workers = workers(WORKERS);
+        let controller = controller(Box::new(AmpleHeadroomProbe));
+
+        let mut grows = Vec::new();
+        for i in 0..WORKERS {
+            let controller = controller.clone();
+            let all = workers.clone();
+            let own = workers[i].clone();
+            grows.push(tokio::spawn(async move {
+                grow_then_lock(&controller, &own, all).await;
+            }));
+        }
+
+        let all_done = async {
+            for task in grows {
+                let _ = task.await;
+            }
+        };
+
+        let result = tokio::time::timeout(DEADLINE, all_done).await;
+        assert!(
+            result.is_ok(),
+            "grows with ample headroom should not scan and should not deadlock"
+        );
+    }
+}
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index e9e8dbed8f..efd692f7c4 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -985,19 +985,37 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
 
     // Should only be called from invocation loop
     pub async fn increase_memory(&self, delta: u64) -> anyhow::Result<()> {
+        // The instance lock must not be held while acquiring memory permits:
+        // permit acquisition runs the admission eviction scan, which takes other
+        // workers' instance locks. Holding this worker's instance lock across
+        // that scan while another growing worker does the same is an AB-BA
+        // deadlock. So acquire the permit without the lock, then re-lock only to
+        // merge it into the running worker.
+        match &*self.instance.lock().await {
+            WorkerInstance::Running(_) => {}
+            WorkerInstance::Stopping(_)
+            | WorkerInstance::WaitingForPermit(_)
+            | WorkerInstance::Unloaded { .. }
+            | WorkerInstance::Deleting => return Ok(()),
+        }
+
+        let Some(new_permits) = self.active_workers().try_acquire(delta).await else {
+            return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory));
+        };
+
+        // Re-check state under the lock: the worker may have changed state while
+        // permits were being acquired. If it is no longer running, drop the
+        // permits (returned to the pool on drop) and treat as a no-op, matching
+        // the non-running arms above.
         match &mut *self.instance.lock().await {
             WorkerInstance::Running(running) => {
-                if let Some(new_permits) = self.active_workers().try_acquire(delta).await {
-                    running.merge_extra_permits(new_permits);
-                    Ok(())
-                } else {
-                    Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory))
-                }
+                running.merge_extra_permits(new_permits);
+                Ok(())
             }
-            WorkerInstance::Stopping(_) => Ok(()),
-            WorkerInstance::WaitingForPermit(_) => Ok(()),
-            WorkerInstance::Unloaded { .. } => Ok(()),
-            WorkerInstance::Deleting => Ok(()),
+            WorkerInstance::Stopping(_)
+            | WorkerInstance::WaitingForPermit(_)
+            | WorkerInstance::Unloaded { .. }
+            | WorkerInstance::Deleting => Ok(()),
         }
     }
 

From be19cf460efe2c905cce2cb005eb1f58a22d4c03 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 19:00:01 -0700
Subject: [PATCH 30/60] feat: change order of tests

---
 .../cloud-density-saturation.yaml             | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index c6cc81c813..8522d57d23 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -28,22 +28,6 @@
 
 name: cloud-density-saturation
 benchmarks:
-  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
-  # amount of resident memory, exercising the admission/eviction path with a
-  # controllable footprint near the limit. Run first: this is the variant that
-  # actually fills memory and drives the gate to its reject/evict path.
-  # size   = number of active, memory-holding agents (the ramp axis)
-  # length = base per-agent memory footprint in bytes; each agent retains a
-  #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
-  #          ~72 MiB average per agent, filling a ~10 GiB usable pool around
-  #          ~145 agents. The sweep brackets that ceiling and pushes well past it
-  #          so the admission gate's reject/evict behaviour near OOM is exercised.
-  - name: throughput-saturation-counters
-    iterations: 1
-    clusterSize: [2]
-    size: [50, 100, 150, 200, 300]
-    length: [16777216]
-
   # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
   # charged once per component, shared across all agents; what scales per agent
   # is the small instance heap). The previous run reached the top of the sweep
@@ -66,3 +50,19 @@ benchmarks:
     clusterSize: [2]
     size: [1000, 2000, 4000, 6000, 8000]
     length: [0]
+
+  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
+  # amount of resident memory, exercising the admission/eviction path with a
+  # controllable footprint near the limit. Runs last: this is the variant that
+  # actually fills memory and drives the gate to its reject/evict path.
+  # size   = number of active, memory-holding agents (the ramp axis)
+  # length = base per-agent memory footprint in bytes; each agent retains a
+  #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
+  #          ~72 MiB average per agent, filling a ~10 GiB usable pool around
+  #          ~145 agents. The sweep brackets that ceiling and pushes well past it
+  #          so the admission gate's reject/evict behaviour near OOM is exercised.
+  - name: throughput-saturation-counters
+    iterations: 1
+    clusterSize: [2]
+    size: [50, 100, 150, 200, 300]
+    length: [16777216]

From 21fd401f29847eb12bb16bbe0a16aa6c59bc447a Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 19:05:06 -0700
Subject: [PATCH 31/60] feat: restore iterations count to 3

---
 .../benchmark_suites/cloud-density-saturation.yaml          | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index 8522d57d23..0b749bc743 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -35,7 +35,7 @@ benchmarks:
   # eviction-churn rather than memory. Dropped the low points that told us
   # nothing and pushed the range up with coarser steps.
   - name: throughput-saturation-echo-rust
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [4000, 8000, 16000, 24000, 32000]
     length: [0]
@@ -46,7 +46,7 @@ benchmarks:
   # Heavier per agent than the Rust variant, so a lower knee — but the previous
   # run reached 2000 without saturating, so push higher and drop the low points.
   - name: throughput-saturation-echo-ts
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1000, 2000, 4000, 6000, 8000]
     length: [0]
@@ -62,7 +62,7 @@ benchmarks:
   #          ~145 agents. The sweep brackets that ceiling and pushes well past it
   #          so the admission gate's reject/evict behaviour near OOM is exercised.
   - name: throughput-saturation-counters
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [50, 100, 150, 200, 300]
     length: [16777216]

From a9285c064cf09dce1c7281722dece4fcce0c4e0f Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 22:37:08 -0700
Subject: [PATCH 32/60] refactor(worker-executor): make cgroup gate primary,
 semaphore clamped second line

---
 .../config/debug-worker-executor.sample.env   |   6 +-
 .../config/debug-worker-executor.toml         |   6 +-
 .../config/worker-executor.sample.env         |   9 +-
 .../config/worker-executor.toml               |   9 +-
 .../services/active_workers/admission/mod.rs  |  29 +--
 .../active_workers/admission/tests.rs         | 208 ++----------------
 .../services/active_workers/memory_probe.rs   |  21 +-
 .../src/services/active_workers/mod.rs        |  40 +++-
 .../src/services/golem_config.rs              |  63 ++++--
 9 files changed, 137 insertions(+), 254 deletions(-)

diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env
index 7d95b6f7dc..3c87d1275c 100644
--- a/golem-debugging-service/config/debug-worker-executor.sample.env
+++ b/golem-debugging-service/config/debug-worker-executor.sample.env
@@ -55,11 +55,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
-GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
+GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
+GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
@@ -231,11 +232,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
-GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
+GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
+GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml
index 316dddd29a..82e5dbdc92 100644
--- a/golem-debugging-service/config/debug-worker-executor.toml
+++ b/golem-debugging-service/config/debug-worker-executor.toml
@@ -96,10 +96,11 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
-admission_reserve_bytes = 268435456
 component_size_coefficient = 2.0
 enable_measured_admission = true
 worker_estimate_coefficient = 1.1
+worker_memory_max_safe_ratio = 0.9
+worker_memory_overcommit_ratio = 1.2
 worker_memory_ratio = 0.8
 
 [memory.oom_retry_config]
@@ -367,10 +368,11 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
-# admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
 # enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
+# worker_memory_max_safe_ratio = 0.9
+# worker_memory_overcommit_ratio = 1.2
 # worker_memory_ratio = 0.8
 # 
 # [memory.oom_retry_config]
diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env
index bc7bf2c3c0..4cd9a25b12 100644
--- a/golem-worker-executor/config/worker-executor.sample.env
+++ b/golem-worker-executor/config/worker-executor.sample.env
@@ -72,11 +72,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
-GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
+GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
+GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
@@ -294,11 +295,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
-GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
+GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
+GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
@@ -486,11 +488,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
-GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456
 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
+GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
+GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml
index 819c4fe03d..f5a8cd0183 100644
--- a/golem-worker-executor/config/worker-executor.toml
+++ b/golem-worker-executor/config/worker-executor.toml
@@ -125,10 +125,11 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
-admission_reserve_bytes = 268435456
 component_size_coefficient = 2.0
 enable_measured_admission = true
 worker_estimate_coefficient = 1.1
+worker_memory_max_safe_ratio = 0.9
+worker_memory_overcommit_ratio = 1.2
 worker_memory_ratio = 0.8
 
 [memory.oom_retry_config]
@@ -459,10 +460,11 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
-# admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
 # enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
+# worker_memory_max_safe_ratio = 0.9
+# worker_memory_overcommit_ratio = 1.2
 # worker_memory_ratio = 0.8
 # 
 # [memory.oom_retry_config]
@@ -763,10 +765,11 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
-# admission_reserve_bytes = 268435456
 # component_size_coefficient = 2.0
 # enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
+# worker_memory_max_safe_ratio = 0.9
+# worker_memory_overcommit_ratio = 1.2
 # worker_memory_ratio = 0.8
 # 
 # [memory.oom_retry_config]
diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs
index 702dc003e7..d89f710859 100644
--- a/golem-worker-executor/src/services/active_workers/admission/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs
@@ -16,11 +16,13 @@
 //!
 //! Gates worker admission on the executor environment's *real* memory headroom
 //! read from the [`MemoryProbe`], rather than on the estimate-based semaphore in
-//! [`super::ActiveWorkers`]. The two work together: the semaphore is a cheap,
-//! high-frequency pre-filter over reserved-but-not-yet-resident intent; this
-//! controller is the authoritative check against measured resident usage. When
-//! headroom is short it evicts already-resident idle-then-warm work; if it still
-//! cannot make room it rejects rather than over-committing.
+//! [`super::ActiveWorkers`]. This controller is the primary, authoritative
+//! check against measured resident usage and refuses admission in normal
+//! operation; the estimate semaphore is the second line of defence behind it,
+//! its atomic permit acquisition catching the concurrent admissions this
+//! (lockless) controller can let through on the same snapshot. When headroom is
+//! short the controller evicts already-resident idle-then-warm work; if it
+//! still cannot make room it rejects rather than over-committing.
 //!
 //! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`]
 //! trait so its decision logic can be exercised in isolation with synthetic
@@ -67,23 +69,14 @@ pub enum AdmissionDecision {
 
 /// Configuration for the headroom-based admission decision.
 ///
-/// Two knobs with distinct jobs:
-///
 /// * `usable_ratio` — fraction of the measured limit usable for WASM admission.
 ///   The remainder is left for the host (the executor process, allocator
 ///   arenas, runtime buffers). Mirrors `worker_memory_ratio`, but applied to the
 ///   measured limit rather than the configured total.
-///
-/// * `reserve_bytes` — margin kept free below the carve-out ceiling to absorb
-///   the window in which concurrent admissions are observed before becoming
-///   resident. Its sufficiency under concurrency is asserted by the property
-///   test in `tests.rs`.
 #[derive(Debug, Clone, Copy)]
 pub struct AdmissionPolicy {
     /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission.
     pub usable_ratio: f64,
-    /// Dynamic safety margin kept free below the carve-out ceiling.
-    pub reserve_bytes: u64,
 }
 
 /// Decides admission against measured headroom, evicting resident idle/warm
@@ -100,14 +93,12 @@ impl AdmissionController {
     }
 
     /// Bytes available for new admissions: the carve-out ceiling
-    /// (`usable_ratio × limit`) minus current usage minus the reserve.
-    /// Saturating throughout — never underflows when already over a ceiling.
+    /// (`usable_ratio × limit`) minus current usage. Saturating — never
+    /// underflows when already over the ceiling.
     fn admissible_headroom(&self) -> u64 {
         let snapshot = self.probe.snapshot();
         let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64;
-        ceiling
-            .saturating_sub(snapshot.current_bytes)
-            .saturating_sub(self.policy.reserve_bytes)
+        ceiling.saturating_sub(snapshot.current_bytes)
     }
 
     /// Decide whether `request_bytes` can be admitted, evicting from `source` if
diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs
index 8eca9e157c..4996b97be7 100644
--- a/golem-worker-executor/src/services/active_workers/admission/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs
@@ -109,23 +109,16 @@ impl EvictionSource for FakeEvictionSource {
     }
 }
 
-fn controller(state: Arc<Mutex<EnvState>>, reserve_bytes: u64) -> AdmissionController {
-    controller_with_ratio(state, 1.0, reserve_bytes)
+fn controller(state: Arc<Mutex<EnvState>>) -> AdmissionController {
+    controller_with_ratio(state, 1.0)
 }
 
-fn controller_with_ratio(
-    state: Arc<Mutex<EnvState>>,
-    usable_ratio: f64,
-    reserve_bytes: u64,
-) -> AdmissionController {
+fn controller_with_ratio(state: Arc<Mutex<EnvState>>, usable_ratio: f64) -> AdmissionController {
     AdmissionController::new(
         Box::new(FakeProbe {
             state: state.clone(),
         }),
-        AdmissionPolicy {
-            usable_ratio,
-            reserve_bytes,
-        },
+        AdmissionPolicy { usable_ratio },
     )
 }
 
@@ -156,7 +149,7 @@ async fn admits_when_headroom_is_ample_without_evicting() {
         }],
         ..Default::default()
     }));
-    let ctrl = controller(state.clone(), 0);
+    let ctrl = controller(state.clone());
     let source = FakeEvictionSource {
         state: state.clone(),
     };
@@ -186,7 +179,7 @@ async fn evicts_idle_before_warm() {
     }));
     // usage = 800, limit = 1000, headroom = 200. Request 300 → shortfall 100.
     // One idle (400) covers it; warm must remain untouched.
-    let ctrl = controller(state.clone(), 0);
+    let ctrl = controller(state.clone());
     let source = FakeEvictionSource {
         state: state.clone(),
     };
@@ -208,7 +201,7 @@ async fn rejects_when_nothing_can_be_freed() {
         residents: vec![],
         ..Default::default()
     }));
-    let ctrl = controller(state.clone(), 0);
+    let ctrl = controller(state.clone());
     let source = FakeEvictionSource {
         state: state.clone(),
     };
@@ -219,31 +212,6 @@ async fn rejects_when_nothing_can_be_freed() {
     assert_eq!(state.lock().unwrap().usage(), 950);
 }
 
-#[test]
-async fn reserve_is_kept_free() {
-    let state = Arc::new(Mutex::new(EnvState {
-        limit: 1000,
-        pinned_usage: 700,
-        residents: vec![],
-        ..Default::default()
-    }));
-    // headroom = 300, reserve = 200 → admissible = 100. Request 150 → reject.
-    let ctrl = controller(state.clone(), 200);
-    let source = FakeEvictionSource {
-        state: state.clone(),
-    };
-
-    assert_eq!(
-        apply_admit(&ctrl, &source, &state, 150).await,
-        AdmissionDecision::Reject
-    );
-    // But a request within the admissible window succeeds.
-    assert_eq!(
-        apply_admit(&ctrl, &source, &state, 100).await,
-        AdmissionDecision::Admit
-    );
-}
-
 // ── Property tests ───────────────────────────────────────────────────────────
 
 #[derive(Debug, Clone)]
@@ -295,12 +263,11 @@ fn arb_fitting_state(
 
 proptest! {
     /// Safety invariant: across any random sequence of admits — with random
-    /// pre-resident work, random sizes, and a random reserve — modeled usage
-    /// must never exceed the limit. This is the property that rules out OOM.
+    /// pre-resident work and random sizes — modeled usage must never exceed the
+    /// limit. This is the property that rules out OOM.
     #[test]
     fn usage_never_exceeds_limit(
         (limit, residents) in arb_fitting_state(500..5000, 20),
-        reserve in 0u64..300,
         ops in arb_ops(),
     ) {
         let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
@@ -311,7 +278,7 @@ proptest! {
                 residents,
                 ..Default::default()
             }));
-            let ctrl = controller(state.clone(), reserve);
+            let ctrl = controller(state.clone());
             let source = FakeEvictionSource { state: state.clone() };
 
             for op in ops {
@@ -350,7 +317,7 @@ proptest! {
                 residents,
                 ..Default::default()
             }));
-            let ctrl = controller(state.clone(), 0);
+            let ctrl = controller(state.clone());
             let source = FakeEvictionSource { state: state.clone() };
 
             for op in ops {
@@ -381,7 +348,7 @@ proptest! {
                 residents,
                 ..Default::default()
             }));
-            let ctrl = controller(state.clone(), 0);
+            let ctrl = controller(state.clone());
             let source = FakeEvictionSource { state: state.clone() };
 
             for op in ops {
@@ -420,7 +387,7 @@ async fn usable_ratio_caps_admission_below_full_limit() {
     }));
     // ceiling = 0.8 * 1000 = 800. Request 850 must be rejected even though the
     // raw limit (1000) would allow it — the top 20% is reserved for the host.
-    let ctrl = controller_with_ratio(state.clone(), 0.8, 0);
+    let ctrl = controller_with_ratio(state.clone(), 0.8);
     let source = FakeEvictionSource {
         state: state.clone(),
     };
@@ -435,150 +402,6 @@ async fn usable_ratio_caps_admission_below_full_limit() {
     );
 }
 
-// ── Concurrency ──────────────────────────────────────────────────────────────
-//
-// In production, each admission reads headroom (`try_admit`) and then separately
-// commits to the upstream atomic permit (modeled here as `pinned_usage +=
-// request`). The two steps are not serialised across concurrent admissions, so
-// several admissions can read the same pre-commit snapshot, all pass the check,
-// and all commit. The `reserve` margin accounts for this instead of a lock:
-// concurrent admissions may push usage above the carve-out ceiling into the
-// reserve, but must not push it above the true `limit`.
-//
-// These tests force the maximum-overlap case with a barrier: every admission
-// completes its headroom check before any admission commits. This makes the
-// maximum overshoot deterministic rather than dependent on task scheduling, so
-// an undersized reserve is reliably detected and a correctly sized one is
-// actually exercised.
-
-/// Run `racers` admissions of `request` bytes against a fresh environment with
-/// the given `reserve`, forcing all headroom checks to complete before any
-/// commit (maximum overlap). Returns the final environment usage and the number
-/// of admits granted.
-async fn race_admissions_worst_case(
-    limit: u64,
-    initial_pinned: u64,
-    reserve: u64,
-    racers: usize,
-    request: u64,
-) -> (u64, usize) {
-    let state = Arc::new(Mutex::new(EnvState {
-        limit,
-        pinned_usage: initial_pinned,
-        residents: vec![],
-        ..Default::default()
-    }));
-    let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve));
-    // All racers check before any commits: the maximum-overlap schedule.
-    let barrier = Arc::new(tokio::sync::Barrier::new(racers));
-
-    let mut handles = Vec::new();
-    for _ in 0..racers {
-        let ctrl = ctrl.clone();
-        let state = state.clone();
-        let barrier = barrier.clone();
-        handles.push(tokio::spawn(async move {
-            let source = FakeEvictionSource {
-                state: state.clone(),
-            };
-            let decision = ctrl.try_admit(request, &source).await;
-            // Hold every racer here until all have decided against the same
-            // pre-commit snapshot, then let the commits run together.
-            barrier.wait().await;
-            if decision == AdmissionDecision::Admit {
-                state.lock().unwrap().pinned_usage += request;
-                true
-            } else {
-                false
-            }
-        }));
-    }
-    let mut admitted = 0;
-    for h in handles {
-        if h.await.unwrap() {
-            admitted += 1;
-        }
-    }
-    let usage = state.lock().unwrap().usage();
-    (usage, admitted)
-}
-
-proptest! {
-    /// A reserve sized for the maximum concurrent overshoot keeps real usage
-    /// under the limit even when every racer checks before any commits, with a
-    /// non-trivial near-ceiling pinned base.
-    ///
-    /// Sizing: at most all `racers` can pass against the same pre-commit
-    /// snapshot, so the reserve must cover `racers × request` landing in the
-    /// window between check and commit. With that margin, usage stays
-    /// `<= limit`.
-    #[test]
-    fn sufficient_reserve_holds_under_worst_case_overlap(
-        racers in 2usize..16,
-        request in 50u64..400,
-        base_fill in 0u64..2000,
-    ) {
-        let reserve = request * racers as u64;
-        // Limit leaves room for the pre-existing fill, the reserve, and at least
-        // one request's worth of admissible headroom above the reserve.
-        let limit = base_fill + reserve + request + 500;
-
-        let rt = tokio::runtime::Builder::new_multi_thread()
-            .worker_threads(4)
-            .build()
-            .unwrap();
-        rt.block_on(async move {
-            let (usage, _) =
-                race_admissions_worst_case(limit, base_fill, reserve, racers, request).await;
-            prop_assert!(
-                usage <= limit,
-                "maximum overlap drove usage {usage} past limit {limit}"
-            );
-            Ok(())
-        }).unwrap();
-    }
-
-    /// With no reserve and maximum overlap forced, several racers admitting at
-    /// once must push usage above the carve-out ceiling. This confirms the race
-    /// the design tolerates is real and this harness reproduces it; without it,
-    /// the safety test above could pass without ever exercising a concurrent
-    /// overshoot. Usage may still stay under `limit`; the assertion is on the
-    /// overshoot past the ceiling.
-    #[test]
-    fn worst_case_overlap_overshoots_ceiling_without_reserve(
-        racers in 2usize..12,
-        request in 50u64..400,
-    ) {
-        // Ceiling headroom sized for exactly one request; no reserve cushion.
-        let ceiling = request;
-        let limit = request * racers as u64 + 1000;
-
-        let rt = tokio::runtime::Builder::new_multi_thread()
-            .worker_threads(4)
-            .build()
-            .unwrap();
-        rt.block_on(async move {
-            // pinned = limit - ceiling so admissible headroom is exactly one
-            // request; with reserve 0, every racer sees room for itself.
-            let pinned = limit - ceiling;
-            let (usage, admitted) =
-                race_admissions_worst_case(limit, pinned, 0, racers, request).await;
-            // More than one admit means the gate let concurrent racers through
-            // on the same snapshot.
-            prop_assert!(
-                admitted >= 2,
-                "expected concurrent over-admission with no reserve, got {admitted} admits"
-            );
-            prop_assert!(
-                usage > ceiling + pinned,
-                "usage {usage} did not overshoot the ceiling {}",
-                ceiling + pinned
-            );
-            Ok(())
-        }).unwrap();
-    }
-}
-
 /// Concurrent memory grows must not deadlock against the admission eviction
 /// scan.
 ///
@@ -666,10 +489,7 @@ mod grow_lock_ordering {
     fn controller(probe: Box<dyn MemoryProbe>) -> Arc<AdmissionController> {
         Arc::new(AdmissionController::new(
             probe,
-            AdmissionPolicy {
-                usable_ratio: 1.0,
-                reserve_bytes: 0,
-            },
+            AdmissionPolicy { usable_ratio: 1.0 },
         ))
     }
 
diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs
index 346a3dd363..6a26b3dd25 100644
--- a/golem-worker-executor/src/services/active_workers/memory_probe.rs
+++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs
@@ -47,9 +47,10 @@ impl MemorySnapshot {
     }
 }
 
-/// Reads the executor environment's real memory state. Cheap enough to sample
-/// at admission time, but not on every wasmtime `memory.grow` (that is what the
-/// estimate-semaphore pre-check absorbs).
+/// Reads the executor environment's real memory state. Sampled at every
+/// admission attempt, including each wasmtime `memory.grow`, so it must be
+/// cheap: the cgroup v2 backend is two small file reads independent of the
+/// number of resident workers.
 pub trait MemoryProbe: Send + Sync + Debug {
     fn snapshot(&self) -> MemorySnapshot;
 
@@ -172,6 +173,10 @@ impl MemoryProbe for CgroupV2Probe {
 /// (falling back to host RAM / process RSS otherwise).
 pub fn default_probe(memory_override: Option<u64>) -> Box<dyn MemoryProbe> {
     if let Some(limit) = memory_override {
+        tracing::info!(
+            limit_bytes = limit,
+            "Memory probe: ProcessRssProbe (limit pinned by system_memory_override)"
+        );
         return Box::new(ProcessRssProbe::new(limit));
     }
 
@@ -184,8 +189,18 @@ pub fn default_probe(memory_override: Option<u64>) -> Box<dyn MemoryProbe> {
     #[cfg(target_os = "linux")]
     {
         if let Some(probe) = CgroupV2Probe::try_new(host_ram) {
+            let snapshot = probe.snapshot();
+            tracing::info!(
+                limit_bytes = snapshot.limit_bytes,
+                current_bytes = snapshot.current_bytes,
+                "Memory probe: CgroupV2Probe (cgroup memory.max/current)"
+            );
             return Box::new(probe);
         }
     }
+    tracing::info!(
+        limit_bytes = host_ram,
+        "Memory probe: ProcessRssProbe (host RAM, no cgroup v2 limit)"
+    );
     Box::new(ProcessRssProbe::new(host_ram))
 }
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 7e95da1703..0b8e02fa38 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -80,10 +80,12 @@ pub struct ActiveWorkers<Ctx: WorkerCtx> {
     acquire_retry_delay: Duration,
     /// Authoritative measured-headroom admission gate. Decides whether real
     /// memory headroom permits a new acquisition, evicting via the worker set
-    /// when short. The estimate-based `worker_memory` semaphore is the cheap
-    /// pre-filter and atomic commit in front of it. `None` when measured
-    /// admission is disabled (e.g. shared test environments) — admission then
-    /// relies on the estimate semaphore alone.
+    /// when short, and is what refuses admission in normal operation. The
+    /// estimate-based `worker_memory` semaphore is the second line of defence
+    /// behind it: its atomic permit acquisition catches the concurrent
+    /// admissions the lockless gate can let through on the same snapshot. `None`
+    /// when measured admission is disabled (e.g. shared test environments) —
+    /// admission then relies on the estimate semaphore alone.
     admission: Option<AdmissionController>,
     /// Charges each resident component's compiled module size to the estimate
     /// pool exactly once (shared across all its workers) rather than per worker.
@@ -138,13 +140,14 @@ impl Drop for WorkerMemoryPermit {
 
 impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self {
-        let worker_memory_size = memory_config.worker_memory();
-        let admission = memory_config.enable_measured_admission.then(|| {
-            AdmissionController::new(
-                default_probe(memory_config.system_memory_override),
-                memory_config.admission_policy(),
-            )
-        });
+        // Build the probe once and size both admission layers from its reported
+        // limit, so the estimate semaphore and the measured-headroom gate share
+        // a single basis (the pod's cgroup limit when constrained, not host RAM).
+        let probe = default_probe(memory_config.system_memory_override);
+        let worker_memory_size = memory_config.worker_memory_for_limit(probe.limit_bytes());
+        let admission = memory_config
+            .enable_measured_admission
+            .then(|| AdmissionController::new(probe, memory_config.admission_policy()));
         let workers = Cache::new(
             None,
             FullCacheEvictionMode::None,
@@ -273,6 +276,15 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
             .expect("requested memory size is too large");
 
         loop {
+            // Blocking acquire: retry until the request can be admitted. A
+            // rejection here is transient, not terminal. The gate reads resident
+            // memory from the probe, which lags real usage (cgroup
+            // `memory.current` only counts already-touched pages), so a worker
+            // admitted earlier may not yet be fully resident; pressure eases as
+            // its pages settle and as other workers finish and release pool
+            // permits. Each iteration backs off, re-reads the gate, and re-tries
+            // the pool, so the caller eventually proceeds once headroom recovers
+            // rather than failing under momentary pressure.
             // Authoritative measured-headroom gate (when enabled). Evicts
             // idle-then-warm when real headroom is short; rejects (and we back
             // off) when it cannot make room rather than risking the limit.
@@ -285,7 +297,11 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
                 continue;
             }
 
-            // Estimate-semaphore pool: cheap pre-check + atomic commit.
+            // Estimate-semaphore pool: the second line of defence behind the
+            // gate. Its atomic permit acquisition catches the concurrent
+            // admissions the lockless gate can let through on the same snapshot.
+            // Sized above the gate ceiling (but clamped below the limit), so it
+            // rarely binds first — the gate refuses in normal operation.
             if let Some(permit) = acquire_pool_permit(
                 &self.worker_memory,
                 &self.workers,
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 5a95e0056f..9a53176160 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -966,10 +966,21 @@ pub struct MemoryConfig {
     /// Multiplier applied to a component's `component_size`, charged once per
     /// resident component (shared across all its workers) rather than per worker.
     pub component_size_coefficient: f64,
-    /// Bytes of measured headroom kept free below the usable ceiling as a margin
-    /// against concurrent admissions overshooting before becoming resident. Used
-    /// by the measured-headroom admission gate.
-    pub admission_reserve_bytes: u64,
+    /// Multiplier (typically > 1.0) applied to the measured limit when sizing the
+    /// estimate semaphore. Per-worker estimates normally exceed real resident
+    /// usage, so the semaphore may sit above the gate ceiling: it is the second
+    /// line of defence behind the measured-headroom gate, catching the
+    /// concurrent-admission race the (lockless) gate cannot, while the gate
+    /// refuses first in normal operation. The result is always clamped to
+    /// `worker_memory_max_safe_ratio` × limit, so it never authorises more than
+    /// that fraction; ratios at or above it (as with the defaults) yield the cap.
+    pub worker_memory_overcommit_ratio: f64,
+    /// Hard upper bound (fraction of the measured limit, < 1.0) on the estimate
+    /// semaphore size, regardless of `worker_memory_overcommit_ratio`. Keeps the
+    /// semaphore below the true limit so headroom always remains for the wasmtime
+    /// host even if the semaphore is the binding guard and estimates happen to
+    /// match real usage.
+    pub worker_memory_max_safe_ratio: f64,
     /// Whether the measured-headroom admission gate is active. Requires the
     /// executor to own its memory environment (its own cgroup/process), as in a
     /// production pod. Disable in shared environments — such as the in-process
@@ -983,12 +994,14 @@ pub struct MemoryConfig {
 }
 
 impl MemoryConfig {
+    /// The memory limit this executor must stay under, resolved with the same
+    /// `default_probe` selection the admission gate uses (a fresh probe is built
+    /// on every call): the configured override when set, the pod's cgroup
+    /// `memory.max` on a constrained Linux deployment, and host RAM only when the
+    /// process is genuinely unconstrained — the pod's ceiling, not host RAM.
     pub fn total_system_memory(&self) -> u64 {
-        self.system_memory_override.unwrap_or_else(|| {
-            let mut sysinfo = sysinfo::System::new();
-            sysinfo.refresh_memory();
-            sysinfo.total_memory()
-        })
+        crate::services::active_workers::memory_probe::default_probe(self.system_memory_override)
+            .limit_bytes()
     }
 
     pub fn system_memory(&self) -> u64 {
@@ -997,18 +1010,30 @@ impl MemoryConfig {
         sysinfo.available_memory()
     }
 
+    /// Size of the estimate semaphore: the measured limit scaled by the
+    /// overcommit ratio, then clamped to `worker_memory_max_safe_ratio` of the
+    /// limit. The overcommit lets the semaphore sit slightly above the gate
+    /// ceiling as a second line of defence (per-worker estimates exceed real
+    /// usage, so it rarely binds first); the clamp guarantees it can never be
+    /// sized to authorise real usage past a safe fraction of the limit, leaving
+    /// headroom for the wasmtime host.
+    pub fn worker_memory_for_limit(&self, limit_bytes: u64) -> usize {
+        let limit = limit_bytes as f64;
+        let overcommit = limit * self.worker_memory_overcommit_ratio;
+        let safe_cap = limit * self.worker_memory_max_safe_ratio;
+        overcommit.min(safe_cap) as usize
+    }
+
     pub fn worker_memory(&self) -> usize {
-        (self.total_system_memory() as f64 * self.worker_memory_ratio) as usize
+        self.worker_memory_for_limit(self.total_system_memory())
     }
 
     /// The admission policy for the measured-headroom gate. Reuses
     /// `worker_memory_ratio` as the usable fraction of the measured limit (the
-    /// host keeps the remainder) and `admission_reserve_bytes` as the concurrent
-    /// overshoot margin.
+    /// host keeps the remainder).
     pub fn admission_policy(&self) -> crate::services::active_workers::admission::AdmissionPolicy {
         crate::services::active_workers::admission::AdmissionPolicy {
             usable_ratio: self.worker_memory_ratio,
-            reserve_bytes: self.admission_reserve_bytes,
         }
     }
 }
@@ -1036,8 +1061,13 @@ impl SafeDisplay for MemoryConfig {
         );
         let _ = writeln!(
             &mut result,
-            "admission reserve bytes: {}",
-            self.admission_reserve_bytes
+            "worker memory overcommit ratio: {}",
+            self.worker_memory_overcommit_ratio
+        );
+        let _ = writeln!(
+            &mut result,
+            "worker memory max safe ratio: {}",
+            self.worker_memory_max_safe_ratio
         );
         let _ = writeln!(
             &mut result,
@@ -1569,7 +1599,8 @@ impl Default for MemoryConfig {
             worker_memory_ratio: 0.8,
             worker_estimate_coefficient: 1.1,
             component_size_coefficient: 2.0,
-            admission_reserve_bytes: 256 * 1024 * 1024,
+            worker_memory_overcommit_ratio: 1.2,
+            worker_memory_max_safe_ratio: 0.9,
             enable_measured_admission: true,
             acquire_retry_delay: Duration::from_millis(500),
             oom_retry_config: RetryConfig {

From 27119b28eec1e0bd3db73438edcb6edba021ff5d Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Tue, 9 Jun 2026 23:12:01 -0700
Subject: [PATCH 33/60] feat: run only initial echo test to make sure we did
 not make it slower

---
 .../benchmark_suites/cloud-perf.yaml          | 172 +++++++++---------
 1 file changed, 86 insertions(+), 86 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index d508c5b5fc..21ef48352a 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -31,100 +31,100 @@ benchmarks:
   - name: throughput-echo
     iterations: 3
     clusterSize: [2]
-    size: [1, 50, 100, 250]
+    size: [1, 10, 50, 100, 250]
     length: [1000]
 
-  # size   = number of workers per implementation
-  # length = payload size in bytes sent to large_input
-  # NOTE: large payloads grow worker linear memory, so this is the throughput
-  # benchmark most relevant to the memory-admission investigation — sized to
-  # match throughput-echo so it exercises real density.
-  - name: throughput-large-input
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 50, 100, 250]
-    length: [100, 10000]
+  # # size   = number of workers per implementation
+  # # length = payload size in bytes sent to large_input
+  # # NOTE: large payloads grow worker linear memory, so this is the throughput
+  # # benchmark most relevant to the memory-admission investigation — sized to
+  # # match throughput-echo so it exercises real density.
+  # - name: throughput-large-input
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 50, 100, 250]
+  #   length: [100, 10000]
 
-  # size   = number of workers per implementation
-  # length = CPU work length passed to cpu_intensive
-  - name: throughput-cpu-intensive
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 50, 100, 250]
-    length: [100]
+  # # size   = number of workers per implementation
+  # # length = CPU work length passed to cpu_intensive
+  # - name: throughput-cpu-intensive
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 50, 100, 250]
+  #   length: [100]
 
-  # Cold-start: compilation cache disabled — measures true cold-start latency
-  # with no warm compiled artefact available.
-  # size   = number of unique components created (each in its own env)
-  # length = seconds to wait per component for pre-compilation warm-up
-  - name: cold-start-unknown-small
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 25, 50]
-    length: [2]
-    disableCompilationCache: true
+  # # Cold-start: compilation cache disabled — measures true cold-start latency
+  # # with no warm compiled artefact available.
+  # # size   = number of unique components created (each in its own env)
+  # # length = seconds to wait per component for pre-compilation warm-up
+  # - name: cold-start-unknown-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 25, 50]
+  #   length: [2]
+  #   disableCompilationCache: true
 
-  - name: cold-start-unknown-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 25, 50]
-    length: [5]
-    disableCompilationCache: true
+  # - name: cold-start-unknown-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 25, 50]
+  #   length: [5]
+  #   disableCompilationCache: true
 
-  # Cold-start: compilation cache enabled — measures latency once the compiled
-  # artefact is available in the cache.
-  # size   = number of unique components created (each in its own env)
-  # length = seconds to wait per component for pre-compilation warm-up
-  # NOTE: if results here are close to the cache-disabled entries above, the
-  # warm-up wait is too short and compilation hasn't finished — bump length.
-  - name: cold-start-unknown-small
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 25, 50]
-    length: [2]
+  # # Cold-start: compilation cache enabled — measures latency once the compiled
+  # # artefact is available in the cache.
+  # # size   = number of unique components created (each in its own env)
+  # # length = seconds to wait per component for pre-compilation warm-up
+  # # NOTE: if results here are close to the cache-disabled entries above, the
+  # # warm-up wait is too short and compilation hasn't finished — bump length.
+  # - name: cold-start-unknown-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 25, 50]
+  #   length: [2]
 
-  - name: cold-start-unknown-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [1, 5, 10, 25, 50]
-    length: [5]
+  # - name: cold-start-unknown-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1, 5, 10, 25, 50]
+  #   length: [5]
 
-  # Invocation latency — hot and cold paths through the Gateway NLB.
-  # Large worker counts to stress the load balancer and connection pool.
-  # size   = number of workers created
-  # length = number of hot invocations per worker after the first cold one
-  - name: latency-small
-    iterations: 3
-    clusterSize: [2]
-    size: [100, 500, 1000, 2000, 5000]
-    length: [2]
+  # # Invocation latency — hot and cold paths through the Gateway NLB.
+  # # Large worker counts to stress the load balancer and connection pool.
+  # # size   = number of workers created
+  # # length = number of hot invocations per worker after the first cold one
+  # - name: latency-small
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [100, 500, 1000, 2000, 5000]
+  #   length: [2]
 
-  - name: latency-medium
-    iterations: 3
-    clusterSize: [2]
-    size: [100, 500, 1000, 2000]
-    length: [5]
+  # - name: latency-medium
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [100, 500, 1000, 2000]
+  #   length: [5]
 
-  # Sleep — measures worker suspension and resumption under real network
-  # conditions. High residency: all `size` workers held in memory sleeping at
-  # once, so this also probes how many resident workers fit (memory-admission
-  # relevant) — pushed past the ~2000 echo proved out.
-  # size   = number of workers launched in parallel
-  # length = sleep duration in milliseconds
-  - name: sleep
-    iterations: 3
-    clusterSize: [2]
-    size: [10, 100, 500, 1000, 2000]
-    length: [10000]
+  # # Sleep — measures worker suspension and resumption under real network
+  # # conditions. High residency: all `size` workers held in memory sleeping at
+  # # once, so this also probes how many resident workers fit (memory-admission
+  # # relevant) — pushed past the ~2000 echo proved out.
+  # # size   = number of workers launched in parallel
+  # # length = sleep duration in milliseconds
+  # - name: sleep
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [10, 100, 500, 1000, 2000]
+  #   length: [10000]
 
-  # Durability overhead — measures the cost of durable vs ephemeral execution
-  # across four variants (durable-persistent, durable-non-persistent,
-  # ephemeral, durable-persistent-commit). size workers concurrent per phase;
-  # sized up to put real load on the oplog/persistence/storage path.
-  # size   = number of workers per variant
-  # length = loop iteration count passed to oplog_heavy
-  - name: durability-overhead
-    iterations: 3
-    clusterSize: [2]
-    size: [10, 50, 100, 250]
-    length: [5000]
+  # # Durability overhead — measures the cost of durable vs ephemeral execution
+  # # across four variants (durable-persistent, durable-non-persistent,
+  # # ephemeral, durable-persistent-commit). size workers concurrent per phase;
+  # # sized up to put real load on the oplog/persistence/storage path.
+  # # size   = number of workers per variant
+  # # length = loop iteration count passed to oplog_heavy
+  # - name: durability-overhead
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [10, 50, 100, 250]
+  #   length: [5000]

From b608593bad07c1049b645b6e0414de77fc618e3a Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 01:48:34 -0700
Subject: [PATCH 34/60] feat: run only saturation test

---
 .../cloud-density-saturation.yaml             | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index 0b749bc743..3c2f6a6b1f 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -28,28 +28,28 @@
 
 name: cloud-density-saturation
 benchmarks:
-  # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
-  # charged once per component, shared across all agents; what scales per agent
-  # is the small instance heap). The previous run reached the top of the sweep
-  # (12000) without saturating pod memory, so the knee here is throughput /
-  # eviction-churn rather than memory. Dropped the low points that told us
-  # nothing and pushed the range up with coarser steps.
-  - name: throughput-saturation-echo-rust
-    iterations: 3
-    clusterSize: [2]
-    size: [4000, 8000, 16000, 24000, 32000]
-    length: [0]
+  # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
+  # # charged once per component, shared across all agents; what scales per agent
+  # # is the small instance heap). The previous run reached the top of the sweep
+  # # (12000) without saturating pod memory, so the knee here is throughput /
+  # # eviction-churn rather than memory. Dropped the low points that told us
+  # # nothing and pushed the range up with coarser steps.
+  # - name: throughput-saturation-echo-rust
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [4000, 8000, 16000, 24000, 32000]
+  #   length: [0]
 
-  # TypeScript echo agents — each instance instantiates its own QuickJS runtime
-  # and JS heap in its own linear memory (the 17.4 MB module is shared once per
-  # component; the per-instance runtime state is the heavy per-agent cost).
-  # Heavier per agent than the Rust variant, so a lower knee — but the previous
-  # run reached 2000 without saturating, so push higher and drop the low points.
-  - name: throughput-saturation-echo-ts
-    iterations: 3
-    clusterSize: [2]
-    size: [1000, 2000, 4000, 6000, 8000]
-    length: [0]
+  # # TypeScript echo agents — each instance instantiates its own QuickJS runtime
+  # # and JS heap in its own linear memory (the 17.4 MB module is shared once per
+  # # component; the per-instance runtime state is the heavy per-agent cost).
+  # # Heavier per agent than the Rust variant, so a lower knee — but the previous
+  # # run reached 2000 without saturating, so push higher and drop the low points.
+  # - name: throughput-saturation-echo-ts
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1000, 2000, 4000, 6000, 8000]
+  #   length: [0]
 
   # Synthetic footprint — each agent retains a deterministic per-agent-distinct
   # amount of resident memory, exercising the admission/eviction path with a
@@ -62,7 +62,7 @@ benchmarks:
   #          ~145 agents. The sweep brackets that ceiling and pushes well past it
   #          so the admission gate's reject/evict behaviour near OOM is exercised.
   - name: throughput-saturation-counters
-    iterations: 3
+    iterations: 1
     clusterSize: [2]
     size: [50, 100, 150, 200, 300]
     length: [16777216]

From 1f1b77a4a36db34b1e27becf185678208236316f Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 02:22:28 -0700
Subject: [PATCH 35/60] feat: bigger saturation spread

---
 .../cloud-density-saturation.yaml             | 70 +++++++++----------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index 3c2f6a6b1f..bd9819524e 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -28,41 +28,41 @@
 
 name: cloud-density-saturation
 benchmarks:
-  # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
-  # # charged once per component, shared across all agents; what scales per agent
-  # # is the small instance heap). The previous run reached the top of the sweep
-  # # (12000) without saturating pod memory, so the knee here is throughput /
-  # # eviction-churn rather than memory. Dropped the low points that told us
-  # # nothing and pushed the range up with coarser steps.
-  # - name: throughput-saturation-echo-rust
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [4000, 8000, 16000, 24000, 32000]
-  #   length: [0]
+  # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
+  # charged once per component, shared across all agents; what scales per agent
+  # is the small instance heap). The previous run reached the top of the sweep
+  # (12000) without saturating pod memory, so the knee here is throughput /
+  # eviction-churn rather than memory. Dropped the low points that told us
+  # nothing and pushed the range up with coarser steps.
+  - name: throughput-saturation-echo-rust
+    iterations: 3
+    clusterSize: [2]
+    size: [2000, 3000, 4000, 8000, 16000, 20000]
+    length: [0]
 
-  # # TypeScript echo agents — each instance instantiates its own QuickJS runtime
-  # # and JS heap in its own linear memory (the 17.4 MB module is shared once per
-  # # component; the per-instance runtime state is the heavy per-agent cost).
-  # # Heavier per agent than the Rust variant, so a lower knee — but the previous
-  # # run reached 2000 without saturating, so push higher and drop the low points.
-  # - name: throughput-saturation-echo-ts
+  # TypeScript echo agents — each instance instantiates its own QuickJS runtime
+  # and JS heap in its own linear memory (the 17.4 MB module is shared once per
+  # component; the per-instance runtime state is the heavy per-agent cost).
+  # Heavier per agent than the Rust variant, so a lower knee — but the previous
+  # run reached 2000 without saturating, so push higher and drop the low points.
+  - name: throughput-saturation-echo-ts
+    iterations: 3
+    clusterSize: [2]
+    size: [1000, 2000, 4000]
+    length: [0]
+
+  # # Synthetic footprint — each agent retains a deterministic per-agent-distinct
+  # # amount of resident memory, exercising the admission/eviction path with a
+  # # controllable footprint near the limit. Run first: this is the variant that
+  # # actually fills memory and drives the gate to its reject/evict path.
+  # # size   = number of active, memory-holding agents (the ramp axis)
+  # # length = base per-agent memory footprint in bytes; each agent retains a
+  # #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
+  # #          ~72 MiB average per agent, filling a ~10 GiB usable pool around
+  # #          ~145 agents. The sweep brackets that ceiling and pushes well past it
+  # #          so the admission gate's reject/evict behaviour near OOM is exercised.
+  # - name: throughput-saturation-counters
   #   iterations: 3
   #   clusterSize: [2]
-  #   size: [1000, 2000, 4000, 6000, 8000]
-  #   length: [0]
-
-  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
-  # amount of resident memory, exercising the admission/eviction path with a
-  # controllable footprint near the limit. Run first: this is the variant that
-  # actually fills memory and drives the gate to its reject/evict path.
-  # size   = number of active, memory-holding agents (the ramp axis)
-  # length = base per-agent memory footprint in bytes; each agent retains a
-  #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
-  #          ~72 MiB average per agent, filling a ~10 GiB usable pool around
-  #          ~145 agents. The sweep brackets that ceiling and pushes well past it
-  #          so the admission gate's reject/evict behaviour near OOM is exercised.
-  - name: throughput-saturation-counters
-    iterations: 1
-    clusterSize: [2]
-    size: [50, 100, 150, 200, 300]
-    length: [16777216]
+  #   size: [50, 100, 150, 200, 300]
+  #   length: [16777216]

From 1bd27ea6fb36f5ba0752c6cf893d49b32b356a4c Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 02:24:19 -0700
Subject: [PATCH 36/60] feat(benchmark): change the steps

---
 .../benchmark_suites/cloud-density-saturation.yaml            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index bd9819524e..78b0064fa2 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -37,7 +37,7 @@ benchmarks:
   - name: throughput-saturation-echo-rust
     iterations: 3
     clusterSize: [2]
-    size: [2000, 3000, 4000, 8000, 16000, 20000]
+    size: [2000, 3000, 4000, 5000, 10000, 15000, 20000]
     length: [0]
 
   # TypeScript echo agents — each instance instantiates its own QuickJS runtime
@@ -48,7 +48,7 @@ benchmarks:
   - name: throughput-saturation-echo-ts
     iterations: 3
     clusterSize: [2]
-    size: [1000, 2000, 4000]
+    size: [1000, 2000, 3000]
     length: [0]
 
   # # Synthetic footprint — each agent retains a deterministic per-agent-distinct

From 898435df30497942fc0b0875a3ae9d09aa3d05b5 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 16:14:33 -0700
Subject: [PATCH 37/60] feat: replace estimate-semaphore completely with
 measured-headroom admission controller

---
 .../config/debug-worker-executor.sample.env   |   4 -
 .../config/debug-worker-executor.toml         |   4 -
 golem-debugging-service/src/lib.rs            |   3 +-
 .../config/worker-executor.sample.env         |   6 -
 .../config/worker-executor.toml               |   6 -
 .../active_workers/admission/tests.txt        |   9 +
 golem-worker-executor/src/lib.rs              |   3 +-
 golem-worker-executor/src/metrics.rs          |  86 ++--
 .../services/active_workers/admission/mod.rs  | 115 ++++-
 .../active_workers/admission/tests.rs         | 440 +++++++++++++++++-
 .../src/services/active_workers/mod.rs        | 354 +++++---------
 .../src/services/active_workers/tests.rs      | 111 +++++
 .../src/services/golem_config.rs              |  53 +--
 golem-worker-executor/src/worker/mod.rs       |  94 ++--
 14 files changed, 843 insertions(+), 445 deletions(-)
 create mode 100644 golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt

diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env
index 3c87d1275c..4349e54ebe 100644
--- a/golem-debugging-service/config/debug-worker-executor.sample.env
+++ b/golem-debugging-service/config/debug-worker-executor.sample.env
@@ -59,8 +59,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
-GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
-GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
@@ -236,8 +234,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
-GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
-GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml
index 82e5dbdc92..01a81fd83a 100644
--- a/golem-debugging-service/config/debug-worker-executor.toml
+++ b/golem-debugging-service/config/debug-worker-executor.toml
@@ -99,8 +99,6 @@ acquire_retry_delay = "500ms"
 component_size_coefficient = 2.0
 enable_measured_admission = true
 worker_estimate_coefficient = 1.1
-worker_memory_max_safe_ratio = 0.9
-worker_memory_overcommit_ratio = 1.2
 worker_memory_ratio = 0.8
 
 [memory.oom_retry_config]
@@ -371,8 +369,6 @@ without_time = false
 # component_size_coefficient = 2.0
 # enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
-# worker_memory_max_safe_ratio = 0.9
-# worker_memory_overcommit_ratio = 1.2
 # worker_memory_ratio = 0.8
 # 
 # [memory.oom_retry_config]
diff --git a/golem-debugging-service/src/lib.rs b/golem-debugging-service/src/lib.rs
index d6062f2cf1..71e3aeac9c 100644
--- a/golem-debugging-service/src/lib.rs
+++ b/golem-debugging-service/src/lib.rs
@@ -377,7 +377,8 @@ pub async fn run_debug_worker_executor<T: Bootstrap<DebugContext> + ?Sized + Sen
 
     let total_system_memory = golem_config.memory.total_system_memory();
     let system_memory = golem_config.memory.system_memory();
-    let worker_memory = golem_config.memory.worker_memory();
+    let worker_memory =
+        (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64;
     info!(
         "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}",
         ISizeFormatter::new(total_system_memory, humansize::BINARY),
diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env
index 4cd9a25b12..d3c7a04559 100644
--- a/golem-worker-executor/config/worker-executor.sample.env
+++ b/golem-worker-executor/config/worker-executor.sample.env
@@ -76,8 +76,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
-GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
-GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
@@ -299,8 +297,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
-GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
-GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
@@ -492,8 +488,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
-GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9
-GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295
 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s"
diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml
index f5a8cd0183..e77c5f9bfa 100644
--- a/golem-worker-executor/config/worker-executor.toml
+++ b/golem-worker-executor/config/worker-executor.toml
@@ -128,8 +128,6 @@ acquire_retry_delay = "500ms"
 component_size_coefficient = 2.0
 enable_measured_admission = true
 worker_estimate_coefficient = 1.1
-worker_memory_max_safe_ratio = 0.9
-worker_memory_overcommit_ratio = 1.2
 worker_memory_ratio = 0.8
 
 [memory.oom_retry_config]
@@ -463,8 +461,6 @@ without_time = false
 # component_size_coefficient = 2.0
 # enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
-# worker_memory_max_safe_ratio = 0.9
-# worker_memory_overcommit_ratio = 1.2
 # worker_memory_ratio = 0.8
 # 
 # [memory.oom_retry_config]
@@ -768,8 +764,6 @@ without_time = false
 # component_size_coefficient = 2.0
 # enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
-# worker_memory_max_safe_ratio = 0.9
-# worker_memory_overcommit_ratio = 1.2
 # worker_memory_ratio = 0.8
 # 
 # [memory.oom_retry_config]
diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt
new file mode 100644
index 0000000000..eb12d21790
--- /dev/null
+++ b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt
@@ -0,0 +1,9 @@
+# Seeds for failure cases proptest has generated in the past. It is
+# automatically read and these particular cases re-run before any
+# novel cases are generated.
+#
+# It is recommended to check this file in to source control so that
+# everyone who runs the test benefits from these saved cases.
+cc b49eb145c9dca28d347382d8e482bb2cb6c5d256ccaba7532b370fbadc2bb3fb # shrinks to (limit, residents) = (500, []), schedule = [Admit(220), Admit(92), Admit(189)]
+cc 9727f7e7aab54f8f48e6b856f9d70428fd8503767677fa7c232e27263273e071 # shrinks to limit = 815, schedule = [Grant(485), Grant(1), Grant(7), Exit(1), Grant(1), FaultIn(2, 1), Grant(40), Exit(2), Grant(284)]
+cc 41321d47abd75b283d651e63e40c0f5191b680b908c05879c02d5f36b70de66c # shrinks to (limit, residents) = (1369, [Resident { size: 144, priority: Idle }, Resident { size: 228, priority: Warm }, Resident { size: 152, priority: Warm }, Resident { size: 101, priority: Idle }, Resident { size: 68, priority: Warm }, Resident { size: 45, priority: Idle }, Resident { size: 30, priority: Idle }, Resident { size: 20, priority: Idle }, Resident { size: 13, priority: Warm }, Resident { size: 9, priority: Idle }, Resident { size: 6, priority: Idle }]), schedule = [Admit(270), Admit(785), Admit(250), Admit(146), Admit(456)]
diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs
index a62f944cf0..1eedc9f5e1 100644
--- a/golem-worker-executor/src/lib.rs
+++ b/golem-worker-executor/src/lib.rs
@@ -1002,7 +1002,8 @@ pub async fn bootstrap_and_run_worker_executor<
 
     let total_system_memory = golem_config.memory.total_system_memory();
     let system_memory = golem_config.memory.system_memory();
-    let worker_memory = golem_config.memory.worker_memory();
+    let worker_memory =
+        (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64;
     info!(
         "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}",
         ISizeFormatter::new(total_system_memory, BINARY),
diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index de6d673632..b611f9985b 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -190,6 +190,46 @@ pub mod workers {
             crate::metrics::BLOB_SIZE_BUCKETS.to_vec()
         )
         .unwrap();
+        pub static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!(
+            "golem_worker_memory_pool_total_bytes",
+            "Usable memory ceiling (usable_ratio * measured limit) the admission gate admits against on this executor",
+            &["executor_id"]
+        )
+        .unwrap();
+        pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!(
+            "golem_worker_memory_pool_used_bytes",
+            "Total linear memory granted to live workers and reserved by the admission gate on this executor",
+            &["executor_id"]
+        )
+        .unwrap();
+        pub static ref WORKER_ADMISSION_RSS_BYTES: GaugeVec = register_gauge_vec!(
+            "golem_worker_admission_rss_bytes",
+            "Measured resident memory (probe snapshot) the admission gate last read on this executor",
+            &["executor_id"]
+        )
+        .unwrap();
+    }
+
+    /// Sets the gate's usable memory ceiling gauge.
+    pub fn record_worker_memory_ceiling(bytes: u64) {
+        WORKER_MEMORY_POOL_TOTAL_BYTES
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .set(bytes as f64);
+    }
+
+    /// Sets the gauge of total memory granted to live workers (the gate's
+    /// reservation).
+    pub fn record_worker_memory_granted(bytes: u64) {
+        WORKER_MEMORY_POOL_USED_BYTES
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .set(bytes as f64);
+    }
+
+    /// Sets the gauge of measured resident memory last read by the gate.
+    pub fn record_worker_admission_rss(bytes: u64) {
+        WORKER_ADMISSION_RSS_BYTES
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .set(bytes as f64);
     }
 
     pub fn record_worker_call(api_name: &'static str) {
@@ -302,18 +342,6 @@ pub mod workers {
         WORKER_FILESYSTEM_SEMAPHORE_AVAILABLE.add(permits.into_f64());
     }
 
-    /// Records acquisition of `bytes` from the worker-memory pool.
-    /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`.
-    pub fn record_memory_permit_acquired(bytes: usize) {
-        crate::metrics::storage::record_worker_memory_pool_acquired(bytes as u64);
-    }
-
-    /// Records release of `bytes` back to the worker-memory pool.
-    /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`.
-    pub fn record_memory_permit_released(bytes: usize) {
-        crate::metrics::storage::record_worker_memory_pool_released(bytes as u64);
-    }
-
     pub fn record_worker_kv_cache_value_size(bytes: usize) {
         WORKER_KV_CACHE_VALUE_SIZE_BYTES
             .with_label_values(&[crate::metrics::storage::executor_id()])
@@ -512,13 +540,13 @@ pub mod wasm {
         .unwrap();
         static ref ALLOCATED_MEMORY_BYTES: Histogram = register_histogram!(
             "allocated_memory_bytes",
-            "Amount of memory allocated by a single memory.grow instruction",
+            "Worker's total linear memory size after a memory.grow, sampled at each grow",
             crate::metrics::MEMORY_SIZE_BUCKETS.to_vec()
         )
         .unwrap();
         static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!(
             "worker_resident_linear_memory_bytes",
-            "Per-worker cumulative linear-memory ceiling (total_linear_memory_size = sum of memory.grow deltas) sampled at permit acquire. This is the semaphore charge basis (x*ml), an upper bound on resident RSS, NOT measured resident memory (grown pages are largely demand-paged); compare to container_memory_working_set_bytes for the gap",
+            "Per-worker cumulative linear-memory grant (total_linear_memory_size = sum of memory.grow deltas) sampled when the worker is admitted. This is the linear memory the admission gate reserves for the worker; it is an upper bound on resident RSS, not measured resident memory, since grown pages are largely demand-paged. Compare to container_memory_working_set_bytes for the gap.",
             crate::metrics::MEMORY_SIZE_BUCKETS.to_vec()
         )
         .unwrap();
@@ -759,18 +787,6 @@ pub mod storage {
             &["executor_id"]
         )
         .unwrap();
-        pub static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!(
-            "golem_worker_memory_pool_total_bytes",
-            "Configured worker-memory semaphore size in bytes for this executor",
-            &["executor_id"]
-        )
-        .unwrap();
-        pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!(
-            "golem_worker_memory_pool_used_bytes",
-            "Bytes currently acquired from the worker-memory semaphore on this executor",
-            &["executor_id"]
-        )
-        .unwrap();
     }
 
     pub fn record_filesystem_pool_total(bytes: u64) {
@@ -790,22 +806,4 @@ pub mod storage {
             .with_label_values(&[executor_id()])
             .sub(bytes as f64);
     }
-
-    pub fn record_worker_memory_pool_total(bytes: u64) {
-        WORKER_MEMORY_POOL_TOTAL_BYTES
-            .with_label_values(&[executor_id()])
-            .set(bytes as f64);
-    }
-
-    pub fn record_worker_memory_pool_acquired(bytes: u64) {
-        WORKER_MEMORY_POOL_USED_BYTES
-            .with_label_values(&[executor_id()])
-            .add(bytes as f64);
-    }
-
-    pub fn record_worker_memory_pool_released(bytes: u64) {
-        WORKER_MEMORY_POOL_USED_BYTES
-            .with_label_values(&[executor_id()])
-            .sub(bytes as f64);
-    }
 }
diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs
index d89f710859..0008f66773 100644
--- a/golem-worker-executor/src/services/active_workers/admission/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs
@@ -14,15 +14,36 @@
 
 //! Measured-headroom admission decision.
 //!
-//! Gates worker admission on the executor environment's *real* memory headroom
-//! read from the [`MemoryProbe`], rather than on the estimate-based semaphore in
-//! [`super::ActiveWorkers`]. This controller is the primary, authoritative
-//! check against measured resident usage and refuses admission in normal
-//! operation; the estimate semaphore is the second line of defence behind it,
-//! its atomic permit acquisition catching the concurrent admissions this
-//! (lockless) controller can let through on the same snapshot. When headroom is
-//! short it evicts already-resident idle-then-warm work; if it still cannot make
-//! room it rejects rather than over-committing.
+//! Gates worker admission on the executor environment's memory headroom. It is
+//! the sole admission authority: there is no estimate-based semaphore behind it.
+//!
+//! The gate weighs two quantities against the usable ceiling:
+//!
+//! * Measured RSS from the [`MemoryProbe`] (cgroup `memory.current` on a
+//!   constrained pod) — what is resident right now.
+//! * The total linear memory *granted* to live workers — what they could fault
+//!   in at any moment.
+//!
+//! Both matter because they fail in opposite directions. Measured RSS lags
+//! admission: `memory.current` counts only touched pages, so a worker admitted
+//! moments ago is not yet resident and a burst admitted against the same low
+//! snapshot would collectively over-commit. The granted total leads residency: a
+//! worker can fault in any page of the virtual memory it was already granted at
+//! any later time, with no admission call to intercept it, so a gate that
+//! reserved only what is resident would let a node full of lightly-touched
+//! workers OOM by writing into memory they already hold. The gate therefore
+//! reserves the full granted total from admission until unload, and admits
+//! against the *larger* of measured RSS and that granted total — safe against
+//! both the burst race and later faulting of granted pages.
+//!
+//! The granted total is maintained by two integer updates: a worker's grant is
+//! added on admission and removed on unload (via [`AdmissionController::release`]
+//! from the worker lifecycle). The headroom check re-derives the reservation
+//! from this maintained total and the current probe reading, so it is O(1) and
+//! exact regardless of worker churn.
+//!
+//! When headroom is short the controller evicts already-resident idle-then-warm
+//! work; if it still cannot make room it rejects rather than over-committing.
 //!
 //! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`]
 //! trait so its decision logic can be exercised in isolation with synthetic
@@ -30,6 +51,7 @@
 
 use super::memory_probe::MemoryProbe;
 use async_trait::async_trait;
+use std::sync::Mutex;
 
 /// Why an eviction candidate is worth evicting, in priority order. Lower
 /// variants are evicted first.
@@ -80,25 +102,81 @@ pub struct AdmissionPolicy {
 }
 
 /// Decides admission against measured headroom, evicting resident idle/warm
-/// work as needed. Holds only its policy and probe; live state is read fresh
-/// from the probe and the eviction source on each call (never cached).
+/// work as needed. Holds its policy and probe; live usage is read fresh from the
+/// probe on each call. The only retained state is `granted`: the total linear
+/// memory granted to live workers, maintained across admit and unload, which the
+/// gate reserves so a worker cannot OOM the node by faulting in granted pages.
 pub struct AdmissionController {
     probe: Box<dyn MemoryProbe>,
     policy: AdmissionPolicy,
+    granted: Mutex<u64>,
 }
 
 impl AdmissionController {
     pub fn new(probe: Box<dyn MemoryProbe>, policy: AdmissionPolicy) -> Self {
-        Self { probe, policy }
+        let ceiling = (probe.snapshot().limit_bytes as f64 * policy.usable_ratio) as u64;
+        crate::metrics::workers::record_worker_memory_ceiling(ceiling);
+        Self {
+            probe,
+            policy,
+            granted: Mutex::new(0),
+        }
     }
 
-    /// Bytes available for new admissions: the carve-out ceiling
-    /// (`usable_ratio × limit`) minus current usage. Saturating — never
-    /// underflows when already over the ceiling.
+    /// Bytes available for a new admission: the usable ceiling minus the larger
+    /// of measured RSS and the total memory granted to live workers. Saturating —
+    /// never underflows when already over the ceiling.
+    ///
+    /// A worker can fault in any page of the virtual memory it was granted at any
+    /// time, with no admission call to intercept it, so the gate must reserve the
+    /// full granted total even before it is resident. Measured RSS can still be
+    /// the larger figure — it includes host/runtime overhead that no grant
+    /// covers — so taking the maximum keeps the gate safe against both the
+    /// grant a worker may yet fault in and any usage the grant does not capture.
     fn admissible_headroom(&self) -> u64 {
         let snapshot = self.probe.snapshot();
         let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64;
-        ceiling.saturating_sub(snapshot.current_bytes)
+        let granted = *self.granted.lock().unwrap();
+        crate::metrics::workers::record_worker_memory_ceiling(ceiling);
+        crate::metrics::workers::record_worker_admission_rss(snapshot.current_bytes);
+        ceiling.saturating_sub(snapshot.current_bytes.max(granted))
+    }
+
+    /// Add `request_bytes` to the granted total for an admitted worker; it stays
+    /// reserved until [`Self::release`]. NOTE(review): takes the lock separately
+    /// from the headroom check, so concurrent admits may race — confirm serialized.
+    fn reserve(&self, request_bytes: u64) {
+        let mut granted = self.granted.lock().unwrap();
+        *granted = granted.saturating_add(request_bytes); // a wrap would reopen the gate
+        crate::metrics::workers::record_worker_memory_granted(*granted);
+    }
+
+    /// Reserve memory for a cost that is a committed consequence of an already
+    /// admitted worker rather than a fresh admission — currently a component's
+    /// compiled module, loaded into RAM when the first worker of the component
+    /// becomes resident and shared by all its workers. Unlike admission this does
+    /// not evict or reject (the worker is already in); it accounts the bytes so
+    /// later admissions see them. Released with [`Self::release`].
+    pub fn reserve_committed(&self, bytes: u64) {
+        self.reserve(bytes);
+    }
+
+    /// Return `reserved_bytes` to the gate once the worker holding them has
+    /// unloaded; saturates at zero. Without this every unload would leave its
+    /// grant counted and admissible headroom would shrink as workers come and go.
+    pub fn release(&self, reserved_bytes: u64) {
+        let mut total = self.granted.lock().unwrap();
+        let remaining = total.saturating_sub(reserved_bytes);
+        *total = remaining;
+        crate::metrics::workers::record_worker_memory_granted(remaining);
+    }
+
+    /// Pre-register grant bytes for workers that were already live when the
+    /// controller was created. Test-only: production registers every worker's
+    /// grant through admission.
+    #[cfg(test)]
+    pub fn seed_granted(&self, bytes: u64) {
+        *self.granted.lock().unwrap() += bytes;
     }
 
     /// Decide whether `request_bytes` can be admitted, evicting from `source` if
@@ -107,7 +185,8 @@ impl AdmissionController {
     /// Eviction is attempted idle-first, then warm, and only up to the shortfall
     /// (never evicts when headroom already suffices). After eviction the
     /// headroom is re-measured against ground truth; the request is admitted only
-    /// if the real headroom now covers it, otherwise it is rejected.
+    /// if the real headroom now covers it, otherwise it is rejected. On admit the
+    /// request is added to the granted total, where it stays until released.
     pub async fn try_admit(
         &self,
         request_bytes: u64,
@@ -115,6 +194,7 @@ impl AdmissionController {
     ) -> AdmissionDecision {
         // Fast path: enough real headroom already, admit without evicting.
         if self.admissible_headroom() >= request_bytes {
+            self.reserve(request_bytes);
             return AdmissionDecision::Admit;
         }
 
@@ -134,6 +214,7 @@ impl AdmissionController {
         // the probe is the authority, and other activity may have moved usage
         // in either direction while we were evicting.
         if self.admissible_headroom() >= request_bytes {
+            self.reserve(request_bytes);
             AdmissionDecision::Admit
         } else {
             AdmissionDecision::Reject
diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs
index 4996b97be7..24e9b3e119 100644
--- a/golem-worker-executor/src/services/active_workers/admission/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs
@@ -47,6 +47,20 @@ struct Resident {
     priority: EvictionPriority,
 }
 
+/// An admitted request whose pages have not yet fully faulted into RSS.
+///
+/// Models the gap between admission and residency: the worker has been admitted
+/// for `reserved` bytes but only `resident` of them have actually touched memory
+/// so far. Real RSS (what the probe reads) reflects only `resident`; the
+/// remaining `reserved - resident` bytes are still in flight and will appear in
+/// RSS later. This lag is what lets concurrent admissions on the same RSS
+/// snapshot collectively over-commit.
+#[derive(Debug, Clone, Copy)]
+struct InFlight {
+    reserved: u64,
+    resident: u64,
+}
+
 /// Shared model of the executor environment's memory.
 #[derive(Debug, Default)]
 struct EnvState {
@@ -56,6 +70,10 @@ struct EnvState {
     pinned_usage: u64,
     /// Resident, evictable work — what the controller may reclaim.
     residents: Vec<Resident>,
+    /// Admitted requests whose pages are still faulting in. Their `resident`
+    /// portion counts toward measured RSS now; their full `reserved` size is
+    /// what RSS will reach once they are fully resident.
+    in_flight: Vec<InFlight>,
     /// Count of evictions performed, for the no-spurious-eviction property.
     evictions: usize,
     /// The priorities evicted, in order, for the ordering property.
@@ -63,8 +81,64 @@ struct EnvState {
 }
 
 impl EnvState {
+    /// Measured RSS: the bytes that have actually faulted in. Lags behind what
+    /// has been admitted, because in-flight requests are only partially
+    /// resident. This is what the probe reports.
     fn usage(&self) -> u64 {
-        self.pinned_usage + self.residents.iter().map(|r| r.size).sum::<u64>()
+        self.pinned_usage
+            + self.residents.iter().map(|r| r.size).sum::<u64>()
+            + self.in_flight.iter().map(|f| f.resident).sum::<u64>()
+    }
+
+    /// Footprint the admitted work reaches once every in-flight request has
+    /// faulted in all of its reserved bytes. Safety is judged against this
+    /// figure rather than measured RSS: reserved bytes always end up resident, so
+    /// a value above the limit means an OOM as soon as the lag resolves.
+    fn eventual_usage(&self) -> u64 {
+        let evictable: u64 = self.residents.iter().map(|r| r.size).sum();
+        let admitted: u64 = self.in_flight.iter().map(|f| f.reserved).sum();
+        self.pinned_usage + evictable + admitted
+    }
+
+    /// Move residency forward by one step: every in-flight request faults in at
+    /// most `step` further bytes of its reservation, lifting measured RSS. A
+    /// request that reaches its full reservation is folded into `pinned_usage`.
+    fn tick_residency(&mut self, step: u64) {
+        let mut retired = 0u64;
+        self.in_flight.retain_mut(|worker| {
+            worker.resident += (worker.reserved - worker.resident).min(step);
+            let settled = worker.resident >= worker.reserved;
+            if settled {
+                retired += worker.reserved;
+            }
+            !settled
+        });
+        self.pinned_usage += retired;
+    }
+
+    /// Let only the in-flight request at `index` touch up to `step` more bytes of
+    /// the memory it was admitted for, leaving every other request as it is. An
+    /// out-of-range `index` is ignored. Residency never passes the reserved size,
+    /// and no admission call is involved: the request is merely faulting in pages
+    /// it already holds, which raises measured RSS for that one worker.
+    fn fault_in_one(&mut self, index: usize, step: u64) {
+        let Some(worker) = self.in_flight.get_mut(index) else {
+            return;
+        };
+        worker.resident += (worker.reserved - worker.resident).min(step);
+    }
+
+    /// Unload the in-flight worker at `index`: its resident pages and the rest of
+    /// its grant both go away, so measured RSS falls by its resident portion.
+    /// Yields the bytes it was admitted for, letting the caller release exactly
+    /// that reservation and nothing belonging to the surviving workers' untouched
+    /// grants, or `None` when no in-flight worker sits at `index`.
+    fn exit_one(&mut self, index: usize) -> Option<u64> {
+        // `Vec::remove` panics on an out-of-range index, so guard it first.
+        if index >= self.in_flight.len() {
+            return None;
+        }
+        Some(self.in_flight.remove(index).reserved)
     }
 }
 
@@ -85,6 +159,9 @@ impl MemoryProbe for FakeProbe {
 
 struct FakeEvictionSource {
     state: Arc<Mutex<EnvState>>,
+    /// The gate, so eviction can release each evicted resident's grant — in
+    /// production, eviction unloads the worker, which releases its grant.
+    controller: Arc<AdmissionController>,
 }
 
 #[async_trait::async_trait]
@@ -99,6 +176,7 @@ impl EvictionSource for FakeEvictionSource {
             if state.residents[i].priority == priority {
                 let victim = state.residents.remove(i);
                 freed += victim.size;
+                self.controller.release(victim.size);
                 state.evictions += 1;
                 state.eviction_order.push(priority);
             } else {
@@ -109,17 +187,35 @@ impl EvictionSource for FakeEvictionSource {
     }
 }
 
-fn controller(state: Arc<Mutex<EnvState>>) -> AdmissionController {
+fn controller(state: Arc<Mutex<EnvState>>) -> Arc<AdmissionController> {
     controller_with_ratio(state, 1.0)
 }
 
-fn controller_with_ratio(state: Arc<Mutex<EnvState>>, usable_ratio: f64) -> AdmissionController {
-    AdmissionController::new(
+fn controller_with_ratio(
+    state: Arc<Mutex<EnvState>>,
+    usable_ratio: f64,
+) -> Arc<AdmissionController> {
+    // Workers already resident when the gate is created had their grants
+    // registered at their own admission; seed the gate to match.
+    let initial_granted = {
+        let s = state.lock().unwrap();
+        s.pinned_usage + s.residents.iter().map(|r| r.size).sum::<u64>()
+    };
+    let controller = AdmissionController::new(
         Box::new(FakeProbe {
             state: state.clone(),
         }),
         AdmissionPolicy { usable_ratio },
-    )
+    );
+    controller.seed_granted(initial_granted);
+    Arc::new(controller)
+}
+
+fn eviction_source(
+    state: Arc<Mutex<EnvState>>,
+    controller: Arc<AdmissionController>,
+) -> FakeEvictionSource {
+    FakeEvictionSource { state, controller }
 }
 
 /// Apply one admission attempt against the model, mutating `usage` on admit.
@@ -136,6 +232,28 @@ async fn apply_admit(
     decision
 }
 
+/// Run one admission attempt in which admitted bytes are NOT resident straight
+/// away. An admitted request joins `in_flight` with zero resident bytes, leaving
+/// measured RSS untouched until a later residency tick or fault-in touches its
+/// pages — the lag in which admissions reading the same snapshot could
+/// collectively over-commit. The gate's decision is returned unchanged.
+async fn apply_staggered_admit(
+    controller: &AdmissionController,
+    source: &FakeEvictionSource,
+    state: &Arc<Mutex<EnvState>>,
+    request: u64,
+) -> AdmissionDecision {
+    let outcome = controller.try_admit(request, source).await;
+    if outcome == AdmissionDecision::Admit {
+        let admitted = InFlight {
+            reserved: request,
+            resident: 0,
+        };
+        state.lock().unwrap().in_flight.push(admitted);
+    }
+    outcome
+}
+
 // ── Single-case unit tests ───────────────────────────────────────────────────
 
 #[test]
@@ -150,9 +268,7 @@ async fn admits_when_headroom_is_ample_without_evicting() {
         ..Default::default()
     }));
     let ctrl = controller(state.clone());
-    let source = FakeEvictionSource {
-        state: state.clone(),
-    };
+    let source = eviction_source(state.clone(), ctrl.clone());
 
     let decision = apply_admit(&ctrl, &source, &state, 200).await;
     assert_eq!(decision, AdmissionDecision::Admit);
@@ -180,9 +296,7 @@ async fn evicts_idle_before_warm() {
     // usage = 800, limit = 1000, headroom = 200. Request 300 → shortfall 100.
     // One idle (400) covers it; warm must remain untouched.
     let ctrl = controller(state.clone());
-    let source = FakeEvictionSource {
-        state: state.clone(),
-    };
+    let source = eviction_source(state.clone(), ctrl.clone());
 
     let decision = apply_admit(&ctrl, &source, &state, 300).await;
     assert_eq!(decision, AdmissionDecision::Admit);
@@ -202,9 +316,7 @@ async fn rejects_when_nothing_can_be_freed() {
         ..Default::default()
     }));
     let ctrl = controller(state.clone());
-    let source = FakeEvictionSource {
-        state: state.clone(),
-    };
+    let source = eviction_source(state.clone(), ctrl.clone());
 
     let decision = apply_admit(&ctrl, &source, &state, 200).await;
     assert_eq!(decision, AdmissionDecision::Reject);
@@ -219,6 +331,17 @@ enum Op {
     Admit(u64),
 }
 
+/// An operation in a staggered-start schedule. Unlike [`Op`], admitted bytes do
+/// not become resident immediately — `Tick` advances residency separately, so
+/// the schedule can interleave admissions and page-faulting in any order.
+#[derive(Debug, Clone)]
+enum StaggeredOp {
+    /// Attempt to admit a worker reserving this many bytes.
+    Admit(u64),
+    /// Fault in up to this many more bytes of every in-flight worker.
+    Tick(u64),
+}
+
 fn arb_resident_priority() -> impl Strategy<Value = EvictionPriority> {
     prop_oneof![Just(EvictionPriority::Idle), Just(EvictionPriority::Warm)]
 }
@@ -279,7 +402,7 @@ proptest! {
                 ..Default::default()
             }));
             let ctrl = controller(state.clone());
-            let source = FakeEvictionSource { state: state.clone() };
+            let source = eviction_source(state.clone(), ctrl.clone());
 
             for op in ops {
                 match op {
@@ -318,7 +441,7 @@ proptest! {
                 ..Default::default()
             }));
             let ctrl = controller(state.clone());
-            let source = FakeEvictionSource { state: state.clone() };
+            let source = eviction_source(state.clone(), ctrl.clone());
 
             for op in ops {
                 match op {
@@ -349,7 +472,7 @@ proptest! {
                 ..Default::default()
             }));
             let ctrl = controller(state.clone());
-            let source = FakeEvictionSource { state: state.clone() };
+            let source = eviction_source(state.clone(), ctrl.clone());
 
             for op in ops {
                 match op {
@@ -375,6 +498,285 @@ proptest! {
     }
 }
 
+// ── Staggered-start safety ───────────────────────────────────────────────────
+
+/// A schedule of admissions interleaved with residency ticks. Admissions
+/// reserve bytes that only become resident when a later `Tick` faults them in,
+/// so the schedule exercises the lag between admission and measured RSS in which
+/// concurrent admissions can collectively over-commit. Skewed toward `Admit` so
+/// bursts of admissions land between ticks (the dangerous case).
+fn arb_staggered_schedule() -> impl Strategy<Value = Vec<StaggeredOp>> {
+    prop::collection::vec(
+        prop_oneof![
+            3 => (1u64..800).prop_map(StaggeredOp::Admit),
+            1 => (1u64..800).prop_map(StaggeredOp::Tick),
+        ],
+        0..60,
+    )
+}
+
+proptest! {
+    /// Safety invariant under staggered starts: for any interleaving of
+    /// admissions and residency ticks, once every admitted worker has fully
+    /// faulted its pages in, resident usage must not exceed the limit.
+    ///
+    /// Reserved bytes always eventually become resident, so the check is made
+    /// against the state after a final full-residency tick: if that can exceed
+    /// the limit, the environment OOMs once the admission lag resolves. This is
+    /// the general form of the staggered-burst case — admissions that read the
+    /// same low RSS snapshot before each other's pages are counted.
+    #[test]
+    fn staggered_starts_never_exceed_limit_once_resident(
+        (limit, residents) in arb_fitting_state(500..5000, 20),
+        schedule in arb_staggered_schedule(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone());
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in schedule {
+                match op {
+                    StaggeredOp::Admit(req) => {
+                        apply_staggered_admit(&ctrl, &source, &state, req).await;
+                    }
+                    StaggeredOp::Tick(step) => {
+                        state.lock().unwrap().tick_residency(step);
+                    }
+                }
+                // Even mid-flight, measured RSS must never exceed the limit.
+                let s = state.lock().unwrap();
+                prop_assert!(
+                    s.usage() <= s.limit,
+                    "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit
+                );
+            }
+
+            // Fault in everything still in flight, then check the eventual
+            // resident footprint fits.
+            state.lock().unwrap().tick_residency(u64::MAX);
+            let s = state.lock().unwrap();
+            prop_assert!(
+                s.eventual_usage() <= s.limit,
+                "eventual resident usage {} exceeded limit {} once fully resident",
+                s.eventual_usage(), s.limit
+            );
+            Ok(())
+        }).unwrap();
+    }
+}
+
+// ── Granted virtual memory ───────────────────────────────────────────────────
+
+/// One step of a schedule that stresses granted-but-untouched memory.
+#[derive(Debug, Clone)]
+enum GrantOp {
+    /// Attempt to admit a worker granted this many bytes of linear memory.
+    Grant(u64),
+    /// Fault in up to this many bytes of the in-flight worker at this index,
+    /// in isolation from the others.
+    FaultIn(usize, u64),
+    /// The in-flight worker at this index finishes and unloads, dropping its
+    /// resident pages and its remaining grant.
+    Exit(usize),
+}
+
+fn arb_grant_schedule() -> impl Strategy<Value = Vec<GrantOp>> {
+    prop::collection::vec(
+        prop_oneof![
+            3 => (1u64..800).prop_map(GrantOp::Grant),
+            3 => (0usize..20, 1u64..800).prop_map(|(i, step)| GrantOp::FaultIn(i, step)),
+            1 => (0usize..20).prop_map(GrantOp::Exit),
+        ],
+        0..80,
+    )
+}
+
+proptest! {
+    /// A worker may fault in the virtual memory it was already granted at any
+    /// later time, with no admission call in the loop. Once every granted byte
+    /// of every admitted worker becomes resident, that resident footprint must
+    /// not exceed the limit.
+    ///
+    /// Granted bytes can always become resident — nothing in the runtime forces
+    /// a worker to leave granted pages untouched — so the safety check is made
+    /// against the sum of granted sizes after faulting everything in. If that
+    /// can exceed the limit, a node of workers touching their already-granted
+    /// pages will OOM with no grow and no admission to intercept it.
+    #[test]
+    fn granted_memory_never_exceeds_limit_once_faulted_in(
+        limit in 800u64..6000,
+        schedule in arb_grant_schedule(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() }));
+            // usable_ratio 1.0 isolates the granted-memory hole from the host
+            // carve-out.
+            let ctrl = controller(state.clone());
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in schedule {
+                match op {
+                    GrantOp::Grant(bytes) => {
+                        apply_staggered_admit(&ctrl, &source, &state, bytes).await;
+                    }
+                    GrantOp::FaultIn(index, step) => {
+                        state.lock().unwrap().fault_in_one(index, step);
+                    }
+                    GrantOp::Exit(index) => {
+                        let reserved = state.lock().unwrap().exit_one(index);
+                        if let Some(reserved) = reserved {
+                            ctrl.release(reserved);
+                        }
+                    }
+                }
+                let s = state.lock().unwrap();
+                prop_assert!(
+                    s.usage() <= s.limit,
+                    "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit
+                );
+            }
+
+            // Every granted byte may yet fault in. Once it all does, it must fit.
+            state.lock().unwrap().tick_residency(u64::MAX);
+            let s = state.lock().unwrap();
+            prop_assert!(
+                s.eventual_usage() <= s.limit,
+                "granted memory {} exceeded limit {} once fully faulted in",
+                s.eventual_usage(), s.limit
+            );
+            Ok(())
+        }).unwrap();
+    }
+
+    /// Liveness: once every admitted worker has unloaded and its pages have left
+    /// memory, the gate's admissible headroom must return to the full ceiling.
+    ///
+    /// Reservations for workers that exit while still holding untouched granted
+    /// memory must be released on unload. If they were not, each such exit would
+    /// permanently shrink headroom, and a node churning workers would slowly
+    /// refuse all admissions despite being empty.
+    #[test]
+    fn headroom_recovers_after_all_workers_exit(
+        limit in 800u64..6000,
+        schedule in arb_grant_schedule(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let usable_ratio = 0.8;
+            let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() }));
+            let ctrl = controller_with_ratio(state.clone(), usable_ratio);
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in schedule {
+                match op {
+                    GrantOp::Grant(bytes) => {
+                        apply_staggered_admit(&ctrl, &source, &state, bytes).await;
+                    }
+                    GrantOp::FaultIn(index, step) => {
+                        state.lock().unwrap().fault_in_one(index, step);
+                    }
+                    GrantOp::Exit(index) => {
+                        let reserved = state.lock().unwrap().exit_one(index);
+                        if let Some(reserved) = reserved {
+                            ctrl.release(reserved);
+                        }
+                    }
+                }
+            }
+
+            // Unload every worker still resident, releasing each reservation, and
+            // clear measured RSS — the environment is now empty.
+            loop {
+                let reserved = state.lock().unwrap().exit_one(0);
+                match reserved {
+                    Some(reserved) => ctrl.release(reserved),
+                    None => break,
+                }
+            }
+            {
+                let mut s = state.lock().unwrap();
+                s.pinned_usage = 0;
+                s.residents.clear();
+            }
+
+            let ceiling = (limit as f64 * usable_ratio) as u64;
+            let headroom = ctrl.headroom_bytes();
+            prop_assert_eq!(
+                headroom, ceiling,
+                "headroom {} did not recover to ceiling {} after all workers exited",
+                headroom, ceiling
+            );
+            Ok(())
+        }).unwrap();
+    }
+}
+
+// ── Density ──────────────────────────────────────────────────────────────────
+
+proptest! {
+    /// Density invariant: in a settled state (no admission lag outstanding), the
+    /// gate packs the environment to within one request of the usable ceiling
+    /// before it starts rejecting. It must not stop admitting while substantial
+    /// usable room remains.
+    ///
+    /// The schedule admits a fixed request size, fully faulting each admitted
+    /// worker in before the next admit so measured RSS tracks admitted bytes and
+    /// the in-flight reservation drains to zero — the steady-state regime where
+    /// density matters. At the first rejection, resident usage must exceed
+    /// `ceiling - request`: the only room a correct gate may leave free is the
+    /// part too small to fit one more request.
+    #[test]
+    fn admits_to_within_one_request_of_the_ceiling(
+        limit in 2000u64..20_000,
+        request in 50u64..600,
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let usable_ratio = 0.8;
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                ..Default::default()
+            }));
+            let ctrl = controller_with_ratio(state.clone(), usable_ratio);
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            let ceiling = (limit as f64 * usable_ratio) as u64;
+
+            // Admit until the first rejection, faulting each worker fully in
+            // before the next so no reservation lag is outstanding.
+            let mut rejected = false;
+            for _ in 0..((limit / request) + 2) {
+                let decision = apply_staggered_admit(&ctrl, &source, &state, request).await;
+                state.lock().unwrap().tick_residency(u64::MAX);
+                if decision == AdmissionDecision::Reject {
+                    rejected = true;
+                    break;
+                }
+            }
+
+            prop_assert!(rejected, "gate never rejected; ceiling {ceiling} too large for the schedule");
+
+            let s = state.lock().unwrap();
+            prop_assert!(
+                s.usage() + request > ceiling,
+                "gate rejected at resident usage {} with ceiling {ceiling}: left more than one request ({request}) of usable room free",
+                s.usage()
+            );
+            // And it must never have over-committed.
+            prop_assert!(s.eventual_usage() <= s.limit);
+            Ok(())
+        }).unwrap();
+    }
+}
+
 // ── Carve-out ratio ──────────────────────────────────────────────────────────
 
 #[test]
@@ -388,9 +790,7 @@ async fn usable_ratio_caps_admission_below_full_limit() {
     // ceiling = 0.8 * 1000 = 800. Request 850 must be rejected even though the
     // raw limit (1000) would allow it — the top 20% is reserved for the host.
     let ctrl = controller_with_ratio(state.clone(), 0.8);
-    let source = FakeEvictionSource {
-        state: state.clone(),
-    };
+    let source = eviction_source(state.clone(), ctrl.clone());
 
     assert_eq!(
         apply_admit(&ctrl, &source, &state, 850).await,
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 0b8e02fa38..b428674a07 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -36,7 +36,6 @@ use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegist
 use memory_probe::default_probe;
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore, TryAcquireError};
 
 use tracing::{Instrument, debug};
 
@@ -73,26 +72,21 @@ impl RegisteredConcurrentAccount {
 /// Holds the metadata and wasmtime structures of currently active Golem workers
 pub struct ActiveWorkers<Ctx: WorkerCtx> {
     workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
-    worker_memory: Arc<Semaphore>,
     worker_filesystem_storage: Arc<FilesystemStorageSemaphore>,
     concurrent_agents: Arc<ConcurrentAgentsScheduler>,
-    priority_allocation_lock: Arc<Mutex<()>>,
     acquire_retry_delay: Duration,
-    /// Authoritative measured-headroom admission gate. Decides whether real
-    /// memory headroom permits a new acquisition, evicting via the worker set
-    /// when short, and is what refuses admission in normal operation. The
-    /// estimate-based `worker_memory` semaphore is the second line of defence
-    /// behind it: its atomic permit acquisition catches the concurrent
-    /// admissions the lockless gate can let through on the same snapshot. `None`
-    /// when measured admission is disabled (e.g. shared test environments) —
-    /// admission then relies on the estimate semaphore alone.
-    admission: Option<AdmissionController>,
-    /// Charges each resident component's compiled module size to the estimate
-    /// pool exactly once (shared across all its workers) rather than per worker.
-    component_charges:
-        Arc<ComponentChargeRegistry<ComponentChargeKey, MemoryPoolChargeSource<Ctx>>>,
+    /// Authoritative measured-headroom admission gate, and the sole admission
+    /// authority. Decides whether real memory headroom permits a new
+    /// acquisition, evicting via the worker set when short. `None` when measured
+    /// admission is disabled (e.g. shared test environments), in which case
+    /// acquisition always proceeds.
+    admission: Option<Arc<AdmissionController>>,
+    /// Reserves each resident component's compiled module size with the gate
+    /// exactly once (shared across all its workers) rather than per worker, so
+    /// the module's resident cost is accounted before it faults into memory.
+    component_charges: Arc<ComponentChargeRegistry<ComponentChargeKey, GateChargeSource>>,
     /// Multiplier applied to a component's `component_size` when sizing its
-    /// module charge permit.
+    /// module charge.
     component_size_coefficient: f64,
 }
 
@@ -100,98 +94,56 @@ pub struct ActiveWorkers<Ctx: WorkerCtx> {
 type ComponentChargeKey = (ComponentId, ComponentRevision);
 
 /// Guard held by a resident worker keeping its component's module charge alive.
-pub type WorkerComponentCharge<Ctx> =
-    ComponentChargeGuard<ComponentChargeKey, MemoryPoolChargeSource<Ctx>>;
-
-#[derive(Debug)]
-pub struct WorkerMemoryPermit {
-    permit: Option<OwnedSemaphorePermit>,
-}
-
-impl WorkerMemoryPermit {
-    fn new(permit: OwnedSemaphorePermit) -> Self {
-        crate::metrics::workers::record_memory_permit_acquired(permit.num_permits());
-        Self {
-            permit: Some(permit),
-        }
-    }
-
-    pub fn num_permits(&self) -> usize {
-        self.permit
-            .as_ref()
-            .map_or(0, |permit| permit.num_permits())
-    }
-
-    pub fn merge(&mut self, mut other: Self) {
-        if let Some(other_permit) = other.permit.take() {
-            match &mut self.permit {
-                Some(permit) => permit.merge(other_permit),
-                None => self.permit = Some(other_permit),
-            }
-        }
-    }
-}
-
-impl Drop for WorkerMemoryPermit {
-    fn drop(&mut self) {
-        crate::metrics::workers::record_memory_permit_released(self.num_permits());
-    }
-}
+pub type WorkerComponentCharge = ComponentChargeGuard<ComponentChargeKey, GateChargeSource>;
 
 impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self {
-        // Build the probe once and size both admission layers from its reported
-        // limit, so the estimate semaphore and the measured-headroom gate share
-        // a single basis (the pod's cgroup limit when constrained, not host RAM).
+        // Build the probe once and hand it to the measured-headroom gate, which
+        // bases its decision on the pod's cgroup limit when constrained (not host
+        // RAM).
         let probe = default_probe(memory_config.system_memory_override);
-        let worker_memory_size = memory_config.worker_memory_for_limit(probe.limit_bytes());
-        let admission = memory_config
-            .enable_measured_admission
-            .then(|| AdmissionController::new(probe, memory_config.admission_policy()));
+        let admission = memory_config.enable_measured_admission.then(|| {
+            Arc::new(AdmissionController::new(
+                probe,
+                memory_config.admission_policy(),
+            ))
+        });
         let workers = Cache::new(
             None,
             FullCacheEvictionMode::None,
             BackgroundEvictionMode::None,
             "active_workers",
         );
-        let worker_memory = Arc::new(Semaphore::new(worker_memory_size));
-        let priority_allocation_lock = Arc::new(Mutex::new(()));
-        let component_charges = ComponentChargeRegistry::new(MemoryPoolChargeSource {
-            worker_memory: worker_memory.clone(),
-            workers: workers.clone(),
-            priority_allocation_lock: priority_allocation_lock.clone(),
-            acquire_retry_delay: memory_config.acquire_retry_delay,
+        let component_charges = ComponentChargeRegistry::new(GateChargeSource {
+            admission: admission.clone(),
         });
         let active_workers = Self {
             workers,
-            worker_memory,
             worker_filesystem_storage: Arc::new(FilesystemStorageSemaphore::new(
                 storage_config.worker_filesystem_storage(),
                 storage_config.acquire_retry_delay,
             )),
             concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()),
             acquire_retry_delay: memory_config.acquire_retry_delay,
-            priority_allocation_lock,
             admission,
             component_charges,
             component_size_coefficient: memory_config.component_size_coefficient,
         };
-        active_workers.initialize_metrics(worker_memory_size);
+        active_workers.initialize_metrics();
         active_workers
     }
 
     /// Acquire (or share) the per-component module charge for a worker of the
-    /// given component. The first resident worker of the component pays its
-    /// compiled-module size (scaled by `component_size_coefficient`) into the
-    /// estimate pool; subsequent workers share the same charge. The returned
-    /// guard releases residency on drop, and the charge is freed when the last
-    /// worker of the component unloads.
+    /// given component. The first resident worker of the component reserves its
+    /// compiled-module size (scaled by `component_size_coefficient`) with the
+    /// gate; subsequent workers share the same charge. The returned guard
+    /// releases the charge when the last worker of the component unloads.
     pub async fn acquire_component_charge(
         &self,
         component_id: ComponentId,
         component_revision: ComponentRevision,
         component_module_bytes: u64,
-    ) -> WorkerComponentCharge<Ctx> {
+    ) -> WorkerComponentCharge {
         let charge_bytes = (self.component_size_coefficient * component_module_bytes as f64) as u64;
         self.component_charges
             .acquire((component_id, component_revision), charge_bytes)
@@ -270,52 +222,31 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         }
     }
 
-    pub async fn acquire(&self, memory: u64) -> WorkerMemoryPermit {
-        let mem32: u32 = memory
-            .try_into()
-            .expect("requested memory size is too large");
-
+    /// Blocking memory admission for a starting worker. Loops until the gate
+    /// admits the request, backing off between attempts.
+    ///
+    /// A rejection is transient, not terminal. The gate reads resident memory
+    /// from the probe, which lags real usage (cgroup `memory.current` only counts
+    /// already-touched pages), so a worker admitted earlier may not yet be fully
+    /// resident; pressure eases as its pages settle and as other workers finish.
+    /// Each iteration backs off and re-reads the gate, so the caller eventually
+    /// proceeds once headroom recovers rather than failing under momentary
+    /// pressure. With measured admission disabled the worker is admitted
+    /// immediately.
+    pub async fn acquire(&self, memory: u64) {
+        let Some(admission) = &self.admission else {
+            return;
+        };
         loop {
-            // Blocking acquire: retry until the request can be admitted. A
-            // rejection here is transient, not terminal. The gate reads resident
-            // memory from the probe, which lags real usage (cgroup
-            // `memory.current` only counts already-touched pages), so a worker
-            // admitted earlier may not yet be fully resident; pressure eases as
-            // its pages settle and as other workers finish and release pool
-            // permits. Each iteration backs off, re-reads the gate, and re-tries
-            // the pool, so the caller eventually proceeds once headroom recovers
-            // rather than failing under momentary pressure.
-            // Authoritative measured-headroom gate (when enabled). Evicts
-            // idle-then-warm when real headroom is short; rejects (and we back
-            // off) when it cannot make room rather than risking the limit.
-            if let Some(admission) = &self.admission
-                && admission.try_admit(memory, &self.eviction_source()).await
-                    == AdmissionDecision::Reject
-            {
-                debug!("Measured headroom insufficient for {mem32}, backing off and retrying");
-                tokio::time::sleep(self.acquire_retry_delay).await;
-                continue;
-            }
-
-            // Estimate-semaphore pool: the second line of defence behind the
-            // gate. Its atomic permit acquisition catches the concurrent
-            // admissions the lockless gate can let through on the same snapshot.
-            // Sized above the gate ceiling (but clamped below the limit), so it
-            // rarely binds first — the gate refuses in normal operation.
-            if let Some(permit) = acquire_pool_permit(
-                &self.worker_memory,
-                &self.workers,
-                &self.priority_allocation_lock,
-                self.acquire_retry_delay,
-                mem32,
-                memory,
-            )
-            .await
+            // Evicts idle-then-warm when real headroom is short; rejects (and we
+            // back off) when it cannot make room rather than risking the limit.
+            if admission.try_admit(memory, &self.eviction_source()).await
+                == AdmissionDecision::Admit
             {
-                break permit;
+                return;
             }
-            // Pool could not satisfy the estimate even after eviction; loop and
-            // re-run the gate before trying again.
+            debug!("Measured headroom insufficient for {memory}, backing off and retrying");
+            tokio::time::sleep(self.acquire_retry_delay).await;
         }
     }
 
@@ -327,51 +258,31 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         }
     }
 
-    pub async fn try_acquire(&self, memory: u64) -> Option<WorkerMemoryPermit> {
-        let mem32: u32 = memory
-            .try_into()
-            .expect("requested memory size is too large");
-
-        // Authoritative measured-headroom gate (when enabled). Single attempt
-        // (this is the non-blocking path): if real headroom is insufficient even
-        // after eviction, do not admit.
-        if let Some(admission) = &self.admission
-            && admission.try_admit(memory, &self.eviction_source()).await
-                == AdmissionDecision::Reject
-        {
-            debug!("Measured headroom insufficient for {mem32}, not admitting");
-            return None;
+    /// Non-blocking memory admission for a growing worker. A single gate attempt:
+    /// returns `true` when the grow is admitted, `false` when real headroom is
+    /// insufficient even after eviction (the caller turns this into a retriable
+    /// out-of-memory trap). With measured admission disabled the grow is always
+    /// admitted.
+    pub async fn try_acquire(&self, memory: u64) -> bool {
+        let Some(admission) = &self.admission else {
+            return true;
+        };
+        match admission.try_admit(memory, &self.eviction_source()).await {
+            AdmissionDecision::Admit => true,
+            AdmissionDecision::Reject => {
+                debug!("Measured headroom insufficient for {memory}, not admitting");
+                false
+            }
         }
+    }
 
-        let mut lock = None;
-        loop {
-            match self.worker_memory.clone().try_acquire_many_owned(mem32) {
-                Ok(permit) => {
-                    debug!(
-                        "Acquired {} memory of {}",
-                        mem32,
-                        self.worker_memory.available_permits()
-                    );
-                    break Some(WorkerMemoryPermit::new(permit));
-                }
-                Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"),
-                Err(TryAcquireError::NoPermits) => {
-                    if lock.is_none() {
-                        debug!(
-                            "Not enough available memory to acquire {mem32} (available: {}), cancelling waiting acquires and retry",
-                            self.worker_memory.available_permits()
-                        );
-                        lock = Some(self.priority_allocation_lock.lock().await);
-                        continue;
-                    } else {
-                        debug!(
-                            "Not enough available memory to acquire {mem32} (available: {})",
-                            self.worker_memory.available_permits()
-                        );
-                        break None;
-                    }
-                }
-            }
+    /// Release the memory a worker reserved with the admission gate when it
+    /// unloads. `bytes` must be the cumulative amount the worker reserved through
+    /// [`Self::acquire`] and [`Self::try_acquire`], so the gate's granted total
+    /// drops by exactly what was reserved. No-op when measured admission is disabled.
+    pub fn release_memory(&self, bytes: u64) {
+        if let Some(admission) = &self.admission {
+            admission.release(bytes);
         }
     }
 
@@ -488,12 +399,11 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     }
 
     /// Initializes worker gauges. Subsequent changes are recorded inline at the mutation sites.
-    fn initialize_metrics(&self, worker_memory_size: usize) {
+    fn initialize_metrics(&self) {
         crate::metrics::workers::initialize_worker_metrics();
         crate::metrics::workers::set_filesystem_semaphore_available(
             self.worker_filesystem_storage.available_bytes(),
         );
-        crate::metrics::storage::record_worker_memory_pool_total(worker_memory_size as u64);
     }
 }
 
@@ -547,62 +457,7 @@ async fn evict_at_most_memory<Ctx: WorkerCtx>(
     freed
 }
 
-/// Frees up to `memory` estimate-permit bytes by evicting idle-then-warm
-/// workers, accounting for permits already available. Returns true when enough
-/// is (or was already) free.
-async fn try_free_up_pool_memory<Ctx: WorkerCtx>(
-    worker_memory: &Semaphore,
-    workers: &Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
-    memory: u64,
-) -> bool {
-    let current_avail = worker_memory.available_permits();
-    let needed = memory.saturating_sub(current_avail as u64);
-    if needed == 0 {
-        return true;
-    }
-
-    let mut freed = 0u64;
-    for priority in [EvictionPriority::Idle, EvictionPriority::Warm] {
-        if freed >= needed {
-            break;
-        }
-        freed += evict_at_most_memory(workers, priority, needed - freed).await;
-    }
-    freed >= needed
-}
-
-/// Single estimate-semaphore acquisition attempt with eviction. Returns the
-/// permit on success, or `None` when the pool cannot satisfy `mem32` even after
-/// evicting idle/warm workers (caller decides whether to retry). Shared by
-/// `ActiveWorkers::acquire` and the per-component charge source so there is one
-/// pool-acquire implementation.
-async fn acquire_pool_permit<Ctx: WorkerCtx>(
-    worker_memory: &Arc<Semaphore>,
-    workers: &Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
-    priority_allocation_lock: &Mutex<()>,
-    acquire_retry_delay: Duration,
-    mem32: u32,
-    memory: u64,
-) -> Option<WorkerMemoryPermit> {
-    let lock = priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once
-    let result = worker_memory.clone().try_acquire_many_owned(mem32);
-    drop(lock);
-    match result {
-        Ok(permit) => Some(WorkerMemoryPermit::new(permit)),
-        Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"),
-        Err(TryAcquireError::NoPermits) => {
-            if try_free_up_pool_memory(worker_memory, workers, memory).await {
-                // Freed enough; signal the caller to retry the acquire.
-                None
-            } else {
-                // Could not free enough; wait before the caller retries.
-                tokio::time::sleep(acquire_retry_delay).await;
-                None
-            }
-        }
-    }
-}
-
+/// Source of evictable, already-resident memory through which the gate reclaims headroom.
 struct WorkerEvictionSource<Ctx: WorkerCtx> {
     workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
 }
@@ -614,36 +469,41 @@ impl<Ctx: WorkerCtx> EvictionSource for WorkerEvictionSource<Ctx> {
     }
 }
 
-/// Production [`ChargeSource`] for the per-component module charge. Takes
-/// estimate-semaphore permits via the same pool acquire+evict path as worker
-/// memory (the measured-headroom gate already accounts for the resident module
-/// via real RSS, so the charge does not pass through it).
-pub struct MemoryPoolChargeSource<Ctx: WorkerCtx> {
-    worker_memory: Arc<Semaphore>,
-    workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
-    priority_allocation_lock: Arc<Mutex<()>>,
-    acquire_retry_delay: Duration,
+/// Production [`ChargeSource`] for the per-component module charge: reserves the
+/// module's bytes with the measured-headroom gate. The module is a committed
+/// consequence of admitting the first worker of a component (it loads into RAM
+/// when that worker becomes resident), so it is reserved rather than admitted —
+/// it neither evicts nor can be refused. `admission` is `None` when measured
+/// admission is disabled, in which case the charge is a no-op.
+pub struct GateChargeSource {
+    admission: Option<Arc<AdmissionController>>,
+}
+
+/// Held module charge: releases its reserved bytes from the gate on drop.
+pub struct GateCharge {
+    admission: Option<Arc<AdmissionController>>,
+    bytes: u64,
+}
+
+impl Drop for GateCharge {
+    fn drop(&mut self) {
+        if let Some(admission) = &self.admission {
+            admission.release(self.bytes);
+        }
+    }
 }
 
 #[async_trait]
-impl<Ctx: WorkerCtx> ChargeSource for MemoryPoolChargeSource<Ctx> {
-    type Charge = WorkerMemoryPermit;
+impl ChargeSource for GateChargeSource {
+    type Charge = GateCharge;
 
-    async fn acquire_charge(&self, bytes: u64) -> WorkerMemoryPermit {
-        let mem32: u32 = bytes.try_into().expect("component charge size too large");
-        loop {
-            if let Some(permit) = acquire_pool_permit(
-                &self.worker_memory,
-                &self.workers,
-                &self.priority_allocation_lock,
-                self.acquire_retry_delay,
-                mem32,
-                bytes,
-            )
-            .await
-            {
-                break permit;
-            }
+    async fn acquire_charge(&self, bytes: u64) -> GateCharge {
+        if let Some(admission) = &self.admission {
+            admission.reserve_committed(bytes);
+        }
+        GateCharge {
+            admission: self.admission.clone(),
+            bytes,
         }
     }
 }
diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs
index 82430c243b..074576eb54 100644
--- a/golem-worker-executor/src/services/active_workers/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/tests.rs
@@ -729,3 +729,114 @@ async fn scheduler_accounts_are_independent() {
     drop(a1);
     drop(a2);
 }
+
+// ── Component module charge against the admission gate ───────────────────────
+
+mod component_module_charge {
+    use super::super::admission::{AdmissionController, AdmissionPolicy};
+    use super::super::component_charge::ComponentChargeRegistry;
+    use super::super::memory_probe::{MemoryProbe, MemorySnapshot};
+    use super::super::{ComponentChargeKey, GateChargeSource, HeldComponentCharge};
+    use golem_common::model::component::{ComponentId, ComponentRevision};
+    use std::sync::Arc;
+    use test_r::test;
+    use uuid::Uuid;
+
+    /// Probe reporting a fixed limit and zero resident memory, so the gate's
+    /// reservation is driven entirely by what is charged through it.
+    #[derive(Debug)]
+    struct FixedProbe {
+        limit: u64,
+    }
+
+    impl MemoryProbe for FixedProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: self.limit,
+                current_bytes: 0,
+            }
+        }
+    }
+
+    fn key() -> ComponentChargeKey {
+        (ComponentId(Uuid::new_v4()), ComponentRevision::INITIAL)
+    }
+
+    /// The first worker of a component reserves the module's bytes with the gate,
+    /// so admissible headroom drops by the module size before it faults into
+    /// memory. A second worker of the same component reserves nothing more, and
+    /// the reservation is released only when the last worker unloads.
+    #[test]
+    async fn module_charge_reserves_with_gate_until_last_worker_unloads() {
+        let limit = 1000u64;
+        let module_bytes = 200u64;
+        let controller = Arc::new(AdmissionController::new(
+            Box::new(FixedProbe { limit }),
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ));
+        let registry = ComponentChargeRegistry::new(GateChargeSource {
+            admission: Some(controller.clone()),
+        });
+        let component = key();
+
+        assert_eq!(controller.headroom_bytes(), limit);
+
+        let first = registry.acquire(component.clone(), module_bytes).await;
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit - module_bytes,
+            "first worker of a component must reserve the module size with the gate"
+        );
+
+        let second = registry.acquire(component.clone(), module_bytes).await;
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit - module_bytes,
+            "a second worker of the same component must not reserve the module again"
+        );
+
+        drop(first);
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit - module_bytes,
+            "the module stays reserved while any worker of the component is resident"
+        );
+
+        drop(second);
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit,
+            "the module reservation is released when the last worker unloads"
+        );
+    }
+
+    /// A `RunningWorker` stores its component charge as
+    /// `Box<dyn HeldComponentCharge>` and releases it by dropping that box when
+    /// the worker unloads. Dropping the box must still release the module
+    /// reservation with the gate, i.e. the concrete charge's release runs through
+    /// the trait object exactly as it would for a live worker.
+    #[test]
+    async fn dropping_boxed_charge_releases_the_reservation() {
+        let limit = 1000u64;
+        let module_bytes = 200u64;
+        let controller = Arc::new(AdmissionController::new(
+            Box::new(FixedProbe { limit }),
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ));
+        let registry = ComponentChargeRegistry::new(GateChargeSource {
+            admission: Some(controller.clone()),
+        });
+
+        let charge = registry.acquire(key(), module_bytes).await;
+        // Store it exactly as RunningWorker does.
+        let boxed: Box<dyn HeldComponentCharge> = Box::new(charge);
+        assert_eq!(controller.headroom_bytes(), limit - module_bytes);
+
+        drop(boxed);
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit,
+            "dropping the boxed charge (as on worker unload) must release the reservation"
+        );
+    }
+}
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 9a53176160..4ff9f0a00c 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -963,30 +963,15 @@ pub struct MemoryConfig {
     pub system_memory_override: Option<u64>,
     pub worker_memory_ratio: f64,
     pub worker_estimate_coefficient: f64,
-    /// Multiplier applied to a component's `component_size`, charged once per
-    /// resident component (shared across all its workers) rather than per worker.
+    /// Multiplier applied to a component's `component_size` when reserving its
+    /// compiled-module memory with the admission gate, charged once per resident
+    /// component (shared across all its workers) rather than per worker.
     pub component_size_coefficient: f64,
-    /// Multiplier (typically > 1.0) applied to the measured limit when sizing the
-    /// estimate semaphore. The estimate per worker is normally larger than its
-    /// real resident usage, so the semaphore is allowed to authorize more
-    /// estimated bytes than the limit: it is the second line of defence behind
-    /// the measured-headroom gate, catching the concurrent-admission race the
-    /// (lockless) gate cannot, while the gate refuses first in normal operation
-    /// against real usage. Always clamped by `worker_memory_max_safe_ratio` so it
-    /// can never itself authorise real usage past a safe fraction of the limit.
-    pub worker_memory_overcommit_ratio: f64,
-    /// Hard upper bound (fraction of the measured limit, < 1.0) on the estimate
-    /// semaphore size, regardless of `worker_memory_overcommit_ratio`. Keeps the
-    /// semaphore below the true limit so headroom always remains for the wasmtime
-    /// host even if the semaphore is the binding guard and estimates happen to
-    /// match real usage.
-    pub worker_memory_max_safe_ratio: f64,
     /// Whether the measured-headroom admission gate is active. Requires the
     /// executor to own its memory environment (its own cgroup/process), as in a
     /// production pod. Disable in shared environments — such as the in-process
     /// test harness — where the probe cannot isolate this executor's footprint
-    /// from co-resident processes; admission then relies on the estimate
-    /// semaphore alone.
+    /// from co-resident processes.
     pub enable_measured_admission: bool,
     #[serde(with = "humantime_serde")]
     pub acquire_retry_delay: Duration,
@@ -1010,24 +995,6 @@ impl MemoryConfig {
         sysinfo.available_memory()
     }
 
-    /// Size of the estimate semaphore: the measured limit scaled by the
-    /// overcommit ratio, then clamped to `worker_memory_max_safe_ratio` of the
-    /// limit. The overcommit lets the semaphore sit slightly above the gate
-    /// ceiling as a second line of defence (per-worker estimates exceed real
-    /// usage, so it rarely binds first); the clamp guarantees it can never be
-    /// sized to authorise real usage past a safe fraction of the limit, leaving
-    /// headroom for the wasmtime host.
-    pub fn worker_memory_for_limit(&self, limit_bytes: u64) -> usize {
-        let limit = limit_bytes as f64;
-        let overcommit = limit * self.worker_memory_overcommit_ratio;
-        let safe_cap = limit * self.worker_memory_max_safe_ratio;
-        overcommit.min(safe_cap) as usize
-    }
-
-    pub fn worker_memory(&self) -> usize {
-        self.worker_memory_for_limit(self.total_system_memory())
-    }
-
     /// The admission policy for the measured-headroom gate. Reuses
     /// `worker_memory_ratio` as the usable fraction of the measured limit (the
     /// host keeps the remainder).
@@ -1059,16 +1026,6 @@ impl SafeDisplay for MemoryConfig {
             "component size coefficient: {}",
             self.component_size_coefficient
         );
-        let _ = writeln!(
-            &mut result,
-            "worker memory overcommit ratio: {}",
-            self.worker_memory_overcommit_ratio
-        );
-        let _ = writeln!(
-            &mut result,
-            "worker memory max safe ratio: {}",
-            self.worker_memory_max_safe_ratio
-        );
         let _ = writeln!(
             &mut result,
             "measured admission enabled: {}",
@@ -1599,8 +1556,6 @@ impl Default for MemoryConfig {
             worker_memory_ratio: 0.8,
             worker_estimate_coefficient: 1.1,
             component_size_coefficient: 2.0,
-            worker_memory_overcommit_ratio: 1.2,
-            worker_memory_max_safe_ratio: 0.9,
             enable_measured_admission: true,
             acquire_retry_delay: Duration::from_millis(500),
             oom_retry_config: RetryConfig {
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index efd692f7c4..bd1ead3243 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -28,7 +28,7 @@ use crate::metrics::storage::record_filesystem_pool_released;
 use crate::model::{AgentConfig, ExecutionStatus, LookupResult, ReadFileResult, TrapType};
 use crate::services::active_workers::{
     FilesystemStoragePermit, HeldComponentCharge, RegisteredConcurrentAccount,
-    WorkerComponentCharge, WorkerMemoryPermit,
+    WorkerComponentCharge,
 };
 use crate::services::events::{Event, EventsSubscription};
 use crate::services::golem_config::SnapshotPolicy;
@@ -137,6 +137,11 @@ pub struct Worker<Ctx: WorkerCtx> {
     /// at least that many bytes from the blocking eviction path, ensuring
     /// enough idle workers are evicted to satisfy the pending write.
     desired_extra_filesystem_storage: AtomicU64,
+    /// Cumulative memory bytes this worker has reserved with the admission gate:
+    /// its initial requirement plus every grow delta. Released back to the gate
+    /// in full when the worker unloads, so the gate's granted total stays exactly
+    /// symmetric with what was reserved.
+    granted_memory: AtomicU64,
 }
 
 impl<Ctx: WorkerCtx> HasOplog for Worker<Ctx> {
@@ -349,6 +354,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             last_resume_request: Mutex::new(Timestamp::now_utc()),
             snapshot_recovery_disabled: AtomicBool::new(false),
             desired_extra_filesystem_storage: AtomicU64::new(0),
+            granted_memory: AtomicU64::new(0),
         };
 
         // Wire the worker event service into the forwarding oplog so plugin errors
@@ -985,12 +991,11 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
 
     // Should only be called from invocation loop
     pub async fn increase_memory(&self, delta: u64) -> anyhow::Result<()> {
-        // The instance lock must not be held while acquiring memory permits:
-        // permit acquisition runs the admission eviction scan, which takes other
-        // workers' instance locks. Holding this worker's instance lock across
-        // that scan while another growing worker does the same is an AB-BA
-        // deadlock. So acquire the permit without the lock, then re-lock only to
-        // merge it into the running worker.
+        // The instance lock must not be held while running the admission gate:
+        // it may run the eviction scan, which takes other workers' instance
+        // locks. Holding this worker's instance lock across that scan while
+        // another growing worker does the same is an AB-BA deadlock. So check the
+        // state, release the lock, then run the gate.
         match &*self.instance.lock().await {
             WorkerInstance::Running(_) => {}
             WorkerInstance::Stopping(_)
@@ -999,23 +1004,22 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             | WorkerInstance::Deleting => return Ok(()),
         }
 
-        let Some(new_permits) = self.active_workers().try_acquire(delta).await else {
-            return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory));
-        };
+        if self.active_workers().try_acquire(delta).await {
+            self.granted_memory.fetch_add(delta, Ordering::Relaxed);
+            Ok(())
+        } else {
+            Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory))
+        }
+    }
 
-        // Re-check state under the lock: the worker may have changed state while
-        // permits were being acquired. If it is no longer running, drop the
-        // permits (returned to the pool on drop) and treat as a no-op, matching
-        // the non-running arms above.
-        match &mut *self.instance.lock().await {
-            WorkerInstance::Running(running) => {
-                running.merge_extra_permits(new_permits);
-                Ok(())
-            }
-            WorkerInstance::Stopping(_)
-            | WorkerInstance::WaitingForPermit(_)
-            | WorkerInstance::Unloaded { .. }
-            | WorkerInstance::Deleting => Ok(()),
+    /// Release this worker's entire accumulated memory grant back to the
+    /// admission gate, resetting the running total to zero. Called when the
+    /// worker stops being resident; a later reload re-accumulates the grant from
+    /// scratch through the acquire path.
+    fn release_granted_memory(&self) {
+        let granted = self.granted_memory.swap(0, Ordering::Relaxed);
+        if granted > 0 {
+            self.active_workers().release_memory(granted);
         }
     }
 
@@ -1667,14 +1671,15 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                 // when stopping via the invocation loop we can stop immediately, no need to go via the stopping status
                 if called_from_invocation_loop {
                     crate::metrics::workers::dec_worker_memory_resident();
+                    self.release_granted_memory();
                     **instance_guard = final_state.into_instance();
                     StopResult::Stopped
                 } else {
                     // drop the running worker, this signals to the invocation loop to start exiting.
-                    // RunningWorker::drop releases the memory permit, so dec resident here.
                     let run_loop_handle = running.stop();
                     let notify = OneShotEvent::new();
                     crate::metrics::workers::dec_worker_memory_resident();
+                    self.release_granted_memory();
                     **instance_guard = WorkerInstance::Stopping(StoppingWorker {
                         notify: notify.clone(),
                         final_state,
@@ -2223,8 +2228,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
 
     async fn start_waiting_worker(
         this: Arc<Worker<Ctx>>,
-        permit: WorkerMemoryPermit,
-        component_charge: WorkerComponentCharge<Ctx>,
+        component_charge: WorkerComponentCharge,
         filesystem_storage_permit: Option<FilesystemStoragePermit>,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
         oom_retry_count: u32,
@@ -2239,7 +2243,6 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                     this.owned_agent_id.clone(),
                     this.queue.clone(),
                     this.clone(),
-                    permit,
                     component_charge,
                     concurrent_agent_permit,
                     oom_retry_count,
@@ -2254,6 +2257,9 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             }
             _ => {
                 debug!("worker was not waiting for permit anymore, not starting");
+                // The grant was reserved before this call; the worker is not
+                // becoming resident, so release it rather than leak it.
+                this.release_granted_memory();
             }
         }
     }
@@ -2391,13 +2397,17 @@ impl WaitingWorker {
                 let agent_id = parent.owned_agent_id.agent_id();
                 let registered_concurrent_account = parent.registered_concurrent_account.clone();
                 let concurrent_agent_permit = registered_concurrent_account.acquire(agent_id).await;
-                // Do not reserve executor memory while waiting for a per-account
-                // concurrency slot. Otherwise one account could fill the memory
-                // pool with workers that are not allowed to run yet.
-                let permit = parent.active_workers().acquire(memory_requirement).await;
-                // Charge the component's compiled module size once per resident
+                // Do not gate executor memory while waiting for a per-account
+                // concurrency slot. Otherwise one account could exhaust the
+                // memory headroom with workers that are not allowed to run yet.
+                parent.active_workers().acquire(memory_requirement).await;
+                parent
+                    .granted_memory
+                    .fetch_add(memory_requirement, Ordering::Relaxed);
+                // Reserve the component's compiled module size once per resident
                 // component (shared by all its workers). Held for as long as this
-                // worker is resident.
+                // worker is resident; the module faults into RAM when the first
+                // worker loads, so reserving it keeps later admissions honest.
                 let component_charge = match parent.component_charge_requirement().await {
                     Ok((component_id, component_revision, component_module_bytes)) => {
                         parent
@@ -2413,6 +2423,7 @@ impl WaitingWorker {
                         warn!(
                             "Failed to determine component charge requirement, not starting: {err}"
                         );
+                        parent.release_granted_memory();
                         return;
                     }
                 };
@@ -2466,7 +2477,6 @@ impl WaitingWorker {
                 debug!("Attempting to start worker after acquiring enough permits");
                 Worker::start_waiting_worker(
                     parent,
-                    permit,
                     component_charge,
                     filesystem_storage_permit,
                     concurrent_agent_permit,
@@ -2499,11 +2509,9 @@ struct RunningWorker {
     handle: Option<JoinHandle<()>>,
     sender: UnboundedSender<WorkerCommand>,
     queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
-    permit: WorkerMemoryPermit,
-    /// Keeps this worker's component module charge alive for as long as the
-    /// worker is resident. Held only to be dropped: dropping it releases the
-    /// component's residency, and the module charge if this was the last worker
-    /// of the component.
+    /// Keeps this worker's component module charge alive while it is resident.
+    /// Held only to be dropped: dropping it releases the component's residency
+    /// (and the module reservation if this was the last worker of the component).
     #[allow(dead_code)]
     component_charge: Box<dyn HeldComponentCharge>,
     /// Storage semaphore permits held by this worker. `None` until storage
@@ -2536,8 +2544,7 @@ impl RunningWorker {
         owned_agent_id: OwnedAgentId,
         queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
         parent: Arc<Worker<Ctx>>,
-        permit: WorkerMemoryPermit,
-        component_charge: WorkerComponentCharge<Ctx>,
+        component_charge: WorkerComponentCharge,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
         oom_retry_count: u32,
     ) -> Self {
@@ -2587,7 +2594,6 @@ impl RunningWorker {
             handle: Some(handle),
             sender,
             queue,
-            permit,
             component_charge: Box::new(component_charge),
             filesystem_storage_permit: None,
             waiting_for_command,
@@ -2596,10 +2602,6 @@ impl RunningWorker {
         }
     }
 
-    pub fn merge_extra_permits(&mut self, extra_permit: WorkerMemoryPermit) {
-        self.permit.merge(extra_permit);
-    }
-
     /// Merge additional storage permits into this worker's storage permit. If
     /// the worker does not yet hold a storage permit, the given permit becomes
     /// the initial one. Additional calls merge into that initial permit.

From 8cecf91c4da8e1be6af64007897e98e2b062a75b Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 16:34:26 -0700
Subject: [PATCH 38/60] fix: clippy warnings

---
 golem-worker-executor/src/services/active_workers/tests.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs
index 074576eb54..53481b4d18 100644
--- a/golem-worker-executor/src/services/active_workers/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/tests.rs
@@ -781,14 +781,14 @@ mod component_module_charge {
 
         assert_eq!(controller.headroom_bytes(), limit);
 
-        let first = registry.acquire(component.clone(), module_bytes).await;
+        let first = registry.acquire(component, module_bytes).await;
         assert_eq!(
             controller.headroom_bytes(),
             limit - module_bytes,
             "first worker of a component must reserve the module size with the gate"
         );
 
-        let second = registry.acquire(component.clone(), module_bytes).await;
+        let second = registry.acquire(component, module_bytes).await;
         assert_eq!(
             controller.headroom_bytes(),
             limit - module_bytes,

From 8566f132bb02bce7a3833b48edd29324b983abd5 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 19:06:27 -0700
Subject: [PATCH 39/60] fix: startup message regarding memory

---
 golem-debugging-service/src/lib.rs               | 13 +++++++++----
 golem-worker-executor/src/lib.rs                 | 12 ++++++++----
 .../src/services/golem_config.rs                 | 16 ----------------
 3 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/golem-debugging-service/src/lib.rs b/golem-debugging-service/src/lib.rs
index 71e3aeac9c..2dea553b6b 100644
--- a/golem-debugging-service/src/lib.rs
+++ b/golem-debugging-service/src/lib.rs
@@ -375,14 +375,19 @@ pub async fn run_debug_worker_executor<T: Bootstrap<DebugContext> + ?Sized + Sen
 ) -> anyhow::Result<RunDetails> {
     debug!("Initializing debug worker executor");
 
-    let total_system_memory = golem_config.memory.total_system_memory();
-    let system_memory = golem_config.memory.system_memory();
+    let memory_snapshot =
+        golem_worker_executor::services::active_workers::memory_probe::default_probe(
+            golem_config.memory.system_memory_override,
+        )
+        .snapshot();
+    let total_system_memory = memory_snapshot.limit_bytes;
+    let used_system_memory = memory_snapshot.current_bytes;
     let worker_memory =
         (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64;
     info!(
-        "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}",
+        "Measured memory limit: {}, Currently used: {}, Usable for workers: {}",
         ISizeFormatter::new(total_system_memory, humansize::BINARY),
-        ISizeFormatter::new(system_memory, humansize::BINARY),
+        ISizeFormatter::new(used_system_memory, humansize::BINARY),
         ISizeFormatter::new(worker_memory, humansize::BINARY)
     );
 
diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs
index 1eedc9f5e1..f2df280bff 100644
--- a/golem-worker-executor/src/lib.rs
+++ b/golem-worker-executor/src/lib.rs
@@ -1000,14 +1000,18 @@ pub async fn bootstrap_and_run_worker_executor<
 ) -> anyhow::Result<RunDetails> {
     debug!("Initializing worker executor");
 
-    let total_system_memory = golem_config.memory.total_system_memory();
-    let system_memory = golem_config.memory.system_memory();
+    let memory_snapshot = crate::services::active_workers::memory_probe::default_probe(
+        golem_config.memory.system_memory_override,
+    )
+    .snapshot();
+    let total_system_memory = memory_snapshot.limit_bytes;
+    let used_system_memory = memory_snapshot.current_bytes;
     let worker_memory =
         (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64;
     info!(
-        "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}",
+        "Measured memory limit: {}, Currently used: {}, Usable for workers: {}",
         ISizeFormatter::new(total_system_memory, BINARY),
-        ISizeFormatter::new(system_memory, BINARY),
+        ISizeFormatter::new(used_system_memory, BINARY),
         ISizeFormatter::new(worker_memory, BINARY)
     );
 
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 4ff9f0a00c..a11a411f77 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -979,22 +979,6 @@ pub struct MemoryConfig {
 }
 
 impl MemoryConfig {
-    /// The memory limit this executor must stay under, resolved through the same
-    /// probe the admission gate uses: the cgroup `memory.max` of the pod on a
-    /// constrained Linux deployment, the configured override when set, and host
-    /// RAM only when the process is genuinely unconstrained. In a container this
-    /// is the pod's ceiling, not the host's total RAM.
-    pub fn total_system_memory(&self) -> u64 {
-        crate::services::active_workers::memory_probe::default_probe(self.system_memory_override)
-            .limit_bytes()
-    }
-
-    pub fn system_memory(&self) -> u64 {
-        let mut sysinfo = sysinfo::System::new();
-        sysinfo.refresh_memory();
-        sysinfo.available_memory()
-    }
-
     /// The admission policy for the measured-headroom gate. Reuses
     /// `worker_memory_ratio` as the usable fraction of the measured limit (the
     /// host keeps the remainder).

From 626e4bac29b50894d67aea0bd6135cb194d3e91f Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 20:25:46 -0700
Subject: [PATCH 40/60] chore: run only oom test

---
 .../cloud-density-saturation.yaml             | 70 +++++++++----------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
index 78b0064fa2..1d7a477661 100644
--- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -28,41 +28,41 @@
 
 name: cloud-density-saturation
 benchmarks:
-  # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
-  # charged once per component, shared across all agents; what scales per agent
-  # is the small instance heap). The previous run reached the top of the sweep
-  # (12000) without saturating pod memory, so the knee here is throughput /
-  # eviction-churn rather than memory. Dropped the low points that told us
-  # nothing and pushed the range up with coarser steps.
-  - name: throughput-saturation-echo-rust
-    iterations: 3
-    clusterSize: [2]
-    size: [2000, 3000, 4000, 5000, 10000, 15000, 20000]
-    length: [0]
-
-  # TypeScript echo agents — each instance instantiates its own QuickJS runtime
-  # and JS heap in its own linear memory (the 17.4 MB module is shared once per
-  # component; the per-instance runtime state is the heavy per-agent cost).
-  # Heavier per agent than the Rust variant, so a lower knee — but the previous
-  # run reached 2000 without saturating, so push higher and drop the low points.
-  - name: throughput-saturation-echo-ts
-    iterations: 3
-    clusterSize: [2]
-    size: [1000, 2000, 3000]
-    length: [0]
+  # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
+  # # charged once per component, shared across all agents; what scales per agent
+  # # is the small instance heap). The previous run reached the top of the sweep
+  # # (12000) without saturating pod memory, so the knee here is throughput /
+  # # eviction-churn rather than memory. Dropped the low points that told us
+  # # nothing and pushed the range up with coarser steps.
+  # - name: throughput-saturation-echo-rust
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [2000, 3000, 4000, 5000, 10000, 15000, 20000]
+  #   length: [0]
 
-  # # Synthetic footprint — each agent retains a deterministic per-agent-distinct
-  # # amount of resident memory, exercising the admission/eviction path with a
-  # # controllable footprint near the limit. Run first: this is the variant that
-  # # actually fills memory and drives the gate to its reject/evict path.
-  # # size   = number of active, memory-holding agents (the ramp axis)
-  # # length = base per-agent memory footprint in bytes; each agent retains a
-  # #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
-  # #          ~72 MiB average per agent, filling a ~10 GiB usable pool around
-  # #          ~145 agents. The sweep brackets that ceiling and pushes well past it
-  # #          so the admission gate's reject/evict behaviour near OOM is exercised.
-  # - name: throughput-saturation-counters
+  # # TypeScript echo agents — each instance instantiates its own QuickJS runtime
+  # # and JS heap in its own linear memory (the 17.4 MB module is shared once per
+  # # component; the per-instance runtime state is the heavy per-agent cost).
+  # # Heavier per agent than the Rust variant, so a lower knee — but the previous
+  # # run reached 2000 without saturating, so push higher and drop the low points.
+  # - name: throughput-saturation-echo-ts
   #   iterations: 3
   #   clusterSize: [2]
-  #   size: [50, 100, 150, 200, 300]
-  #   length: [16777216]
+  #   size: [1000, 2000, 3000]
+  #   length: [0]
+
+  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
+  # amount of resident memory, exercising the admission/eviction path with a
+  # controllable footprint near the limit. Run first: this is the variant that
+  # actually fills memory and drives the gate to its reject/evict path.
+  # size   = number of active, memory-holding agents (the ramp axis)
+  # length = base per-agent memory footprint in bytes; each agent retains a
+  #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
+  #          ~72 MiB average per agent, filling a ~10 GiB usable pool around
+  #          ~145 agents. The sweep brackets that ceiling and pushes well past it
+  #          so the admission gate's reject/evict behaviour near OOM is exercised.
+  - name: throughput-saturation-counters
+    iterations: 1
+    clusterSize: [2]
+    size: [50, 100, 150, 200, 300]
+    length: [16777216]

From 24340479cc2edb573f22ee34dc675b9801b2bfae Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 21:05:46 -0700
Subject: [PATCH 41/60] feat: enable the whole perf test suite

---
 .../benchmark_suites/cloud-perf.yaml          | 174 +++++++++---------
 1 file changed, 87 insertions(+), 87 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 21ef48352a..ea8ce74403 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -29,102 +29,102 @@ benchmarks:
   # size   = number of workers per implementation (×6 implementations total)
   # length = unused for echo
   - name: throughput-echo
-    iterations: 3
+    iterations: 1
     clusterSize: [2]
-    size: [1, 10, 50, 100, 250]
+    size: [1, 50, 100, 250]
     length: [1000]
 
-  # # size   = number of workers per implementation
-  # # length = payload size in bytes sent to large_input
-  # # NOTE: large payloads grow worker linear memory, so this is the throughput
-  # # benchmark most relevant to the memory-admission investigation — sized to
-  # # match throughput-echo so it exercises real density.
-  # - name: throughput-large-input
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 50, 100, 250]
-  #   length: [100, 10000]
+  # size   = number of workers per implementation
+  # length = payload size in bytes sent to large_input
+  # NOTE: large payloads grow worker linear memory, so this is the throughput
+  # benchmark most relevant to the memory-admission investigation — sized to
+  # match throughput-echo so it exercises real density.
+  - name: throughput-large-input
+    iterations: 1
+    clusterSize: [2]
+    size: [1, 50, 100, 250]
+    length: [100, 10000]
 
-  # # size   = number of workers per implementation
-  # # length = CPU work length passed to cpu_intensive
-  # - name: throughput-cpu-intensive
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 50, 100, 250]
-  #   length: [100]
+  # size   = number of workers per implementation
+  # length = CPU work length passed to cpu_intensive
+  - name: throughput-cpu-intensive
+    iterations: 1
+    clusterSize: [2]
+    size: [1, 50, 100, 250]
+    length: [100]
 
-  # # Cold-start: compilation cache disabled — measures true cold-start latency
-  # # with no warm compiled artefact available.
-  # # size   = number of unique components created (each in its own env)
-  # # length = seconds to wait per component for pre-compilation warm-up
-  # - name: cold-start-unknown-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 25, 50]
-  #   length: [2]
-  #   disableCompilationCache: true
+  # Cold-start: compilation cache disabled — measures true cold-start latency
+  # with no warm compiled artefact available.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  - name: cold-start-unknown-small
+    iterations: 1
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [2]
+    disableCompilationCache: true
 
-  # - name: cold-start-unknown-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 25, 50]
-  #   length: [5]
-  #   disableCompilationCache: true
+  - name: cold-start-unknown-medium
+    iterations: 1
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [5]
+    disableCompilationCache: true
 
-  # # Cold-start: compilation cache enabled — measures latency once the compiled
-  # # artefact is available in the cache.
-  # # size   = number of unique components created (each in its own env)
-  # # length = seconds to wait per component for pre-compilation warm-up
-  # # NOTE: if results here are close to the cache-disabled entries above, the
-  # # warm-up wait is too short and compilation hasn't finished — bump length.
-  # - name: cold-start-unknown-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 25, 50]
-  #   length: [2]
+  # Cold-start: compilation cache enabled — measures latency once the compiled
+  # artefact is available in the cache.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  # NOTE: if results here are close to the cache-disabled entries above, the
+  # warm-up wait is too short and compilation hasn't finished — bump length.
+  - name: cold-start-unknown-small
+    iterations: 1
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [2]
 
-  # - name: cold-start-unknown-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [1, 5, 10, 25, 50]
-  #   length: [5]
+  - name: cold-start-unknown-medium
+    iterations: 1
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [5]
 
-  # # Invocation latency — hot and cold paths through the Gateway NLB.
-  # # Large worker counts to stress the load balancer and connection pool.
-  # # size   = number of workers created
-  # # length = number of hot invocations per worker after the first cold one
-  # - name: latency-small
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [100, 500, 1000, 2000, 5000]
-  #   length: [2]
+  # Invocation latency — hot and cold paths through the Gateway NLB.
+  # Large worker counts to stress the load balancer and connection pool.
+  # size   = number of workers created
+  # length = number of hot invocations per worker after the first cold one
+  - name: latency-small
+    iterations: 1
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000, 5000]
+    length: [2]
 
-  # - name: latency-medium
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [100, 500, 1000, 2000]
-  #   length: [5]
+  - name: latency-medium
+    iterations: 1
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000]
+    length: [5]
 
-  # # Sleep — measures worker suspension and resumption under real network
-  # # conditions. High residency: all `size` workers held in memory sleeping at
-  # # once, so this also probes how many resident workers fit (memory-admission
-  # # relevant) — pushed past the ~2000 echo proved out.
-  # # size   = number of workers launched in parallel
-  # # length = sleep duration in milliseconds
-  # - name: sleep
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [10, 100, 500, 1000, 2000]
-  #   length: [10000]
+  # Sleep — measures worker suspension and resumption under real network
+  # conditions. High residency: all `size` workers held in memory sleeping at
+  # once, so this also probes how many resident workers fit (memory-admission
+  # relevant) — pushed past the ~2000 echo proved out.
+  # size   = number of workers launched in parallel
+  # length = sleep duration in milliseconds
+  - name: sleep
+    iterations: 1
+    clusterSize: [2]
+    size: [10, 100, 500, 1000, 2000]
+    length: [10000]
 
-  # # Durability overhead — measures the cost of durable vs ephemeral execution
-  # # across four variants (durable-persistent, durable-non-persistent,
-  # # ephemeral, durable-persistent-commit). size workers concurrent per phase;
-  # # sized up to put real load on the oplog/persistence/storage path.
-  # # size   = number of workers per variant
-  # # length = loop iteration count passed to oplog_heavy
-  # - name: durability-overhead
-  #   iterations: 3
-  #   clusterSize: [2]
-  #   size: [10, 50, 100, 250]
-  #   length: [5000]
+  # Durability overhead — measures the cost of durable vs ephemeral execution
+  # across four variants (durable-persistent, durable-non-persistent,
+  # ephemeral, durable-persistent-commit). size workers concurrent per phase;
+  # sized up to put real load on the oplog/persistence/storage path.
+  # size   = number of workers per variant
+  # length = loop iteration count passed to oplog_heavy
+  - name: durability-overhead
+    iterations: 1
+    clusterSize: [2]
+    size: [10, 50, 100, 250]
+    length: [5000]

From a8fcf52168155fa2d3f33291ecbf86727a348087 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Wed, 10 Jun 2026 22:43:04 -0700
Subject: [PATCH 42/60] feat: more metrics plus FixedProbe for tests

---
 golem-worker-executor-test-utils/src/lib.rs   | 44 ++++++++++++++++---
 golem-worker-executor/src/lib.rs              | 17 +++++--
 golem-worker-executor/src/metrics.rs          | 18 ++++++++
 .../services/active_workers/memory_probe.rs   | 32 ++++++++++++++
 .../src/services/active_workers/mod.rs        | 14 +++++-
 golem-worker-executor/src/worker/mod.rs       |  1 +
 6 files changed, 116 insertions(+), 10 deletions(-)

diff --git a/golem-worker-executor-test-utils/src/lib.rs b/golem-worker-executor-test-utils/src/lib.rs
index fcfb661670..6d86ffb7a7 100644
--- a/golem-worker-executor-test-utils/src/lib.rs
+++ b/golem-worker-executor-test-utils/src/lib.rs
@@ -82,6 +82,7 @@ use golem_worker_executor::preview2::golem::agent::host::{
 };
 use golem_worker_executor::preview2::{golem_api_1_x, golem_durability};
 use golem_worker_executor::services::active_workers::ActiveWorkers;
+use golem_worker_executor::services::active_workers::memory_probe::FixedProbe;
 use golem_worker_executor::services::agent_types::AgentTypesService;
 use golem_worker_executor::services::agent_webhooks::AgentWebhooksService;
 use golem_worker_executor::services::blob_store::{
@@ -536,8 +537,9 @@ fn make_base_test_config(deps: &WorkerExecutorTestDependencies) -> GolemConfig {
         // The measured-headroom admission gate requires the executor to own its
         // memory environment (cgroup/process). The in-process test harness runs
         // the executor alongside the test framework and other services, so the
-        // probe cannot isolate this executor's footprint — disable it and gate on
-        // the estimate semaphore alone, matching pre-gate behaviour.
+        // probe cannot isolate this executor's footprint. Disable the gate so
+        // admission always proceeds and tests are not subject to a memory limit
+        // derived from the shared host.
         memory: MemoryConfig {
             enable_measured_admission: false,
             ..Default::default()
@@ -705,9 +707,16 @@ pub async fn start_customized(
     apply_sqlite_storage_config(&mut config, deps, context);
     config.memory = MemoryConfig {
         system_memory_override,
-        // Measured admission disabled in the shared in-process test harness; the
-        // small system_memory_override here drives the estimate semaphore alone.
-        enable_measured_admission: false,
+        // Enable the measured-headroom gate when a test pins a memory limit, so
+        // memory-pressure tests exercise the real admission controller under that
+        // limit. The test bootstrap (create_active_workers) feeds the gate a
+        // fixed probe reporting this limit with zero current usage, so admission
+        // is decided on the granted accounting against the pinned limit and is
+        // not perturbed by the shared test process's RSS. Otherwise the gate is
+        // disabled (see make_base_test_config). The usable ratio
+        // (worker_memory_ratio, default 0.8) applies, matching the pre-gate
+        // semaphore pool size of system_memory_override * ratio.
+        enable_measured_admission: system_memory_override.is_some(),
         ..Default::default()
     };
     config.filesystem_storage = FilesystemStorageConfig {
@@ -1370,6 +1379,31 @@ impl InvocationContextManagement for TestWorkerCtx {
 
 #[async_trait]
 impl Bootstrap<TestWorkerCtx> for TestServerBootstrap {
+    fn create_active_workers(
+        &self,
+        golem_config: &GolemConfig,
+    ) -> Arc<ActiveWorkers<TestWorkerCtx>> {
+        // The in-process test harness shares its process (and RSS) with the test
+        // framework and other services, so a process-RSS probe cannot isolate
+        // this executor's footprint. When a test pins a memory limit via
+        // system_memory_override, give the gate a fixed probe reporting that
+        // limit with zero current usage, so admission is decided solely on the
+        // granted accounting (exact and process-isolated) against the pinned
+        // limit. The usable_ratio (worker_memory_ratio) still applies, matching
+        // the pre-gate semaphore pool size of system_memory_override * ratio.
+        match golem_config.memory.system_memory_override {
+            Some(limit) => Arc::new(ActiveWorkers::new_with_probe(
+                Box::new(FixedProbe::new(limit, 0)),
+                &golem_config.memory,
+                &golem_config.filesystem_storage,
+            )),
+            None => Arc::new(ActiveWorkers::new(
+                &golem_config.memory,
+                &golem_config.filesystem_storage,
+            )),
+        }
+    }
+
     fn create_shard_manager_service(
         &self,
         _shard_manager_client: Arc<dyn golem_service_base::clients::shard_manager::ShardManager>,
diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs
index f2df280bff..8b05e6e71c 100644
--- a/golem-worker-executor/src/lib.rs
+++ b/golem-worker-executor/src/lib.rs
@@ -161,6 +161,18 @@ impl Drop for RunDetails {
 #[async_trait]
 #[allow(clippy::too_many_arguments)]
 pub trait Bootstrap<Ctx: WorkerCtx> {
+    /// Creates the [`ActiveWorkers`] service, including the measured-headroom
+    /// admission gate. The default builds the memory probe from the config
+    /// (cgroup/process/override). The in-process test harness overrides this to
+    /// inject a probe with a pinned limit and usage so the gate is deterministic
+    /// and isolated from the shared test process's RSS.
+    fn create_active_workers(&self, golem_config: &GolemConfig) -> Arc<ActiveWorkers<Ctx>> {
+        Arc::new(ActiveWorkers::<Ctx>::new(
+            &golem_config.memory,
+            &golem_config.filesystem_storage,
+        ))
+    }
+
     fn create_shard_manager_service(
         &self,
         shard_manager_client: Arc<dyn golem_service_base::clients::shard_manager::ShardManager>,
@@ -769,10 +781,7 @@ pub async fn create_worker_executor_impl<
         }
     };
 
-    let active_workers = Arc::new(ActiveWorkers::<Ctx>::new(
-        &golem_config.memory,
-        &golem_config.filesystem_storage,
-    ));
+    let active_workers = bootstrap.create_active_workers(&golem_config);
 
     let file_loader = Arc::new(FileLoader::new(
         initial_files_service.clone(),
diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index b611f9985b..0e009c0705 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -208,6 +208,21 @@ pub mod workers {
             &["executor_id"]
         )
         .unwrap();
+        pub static ref WORKER_MEMORY_GROW_REJECTED_TOTAL: CounterVec = register_counter_vec!(
+            "golem_worker_memory_grow_rejected_total",
+            "Invocations interrupted because a worker's linear-memory grow could not be admitted by the gate (out-of-memory trap, retried via reacquire)",
+            &["executor_id"]
+        )
+        .unwrap();
+    }
+
+    /// Counts one invocation interrupted because a linear-memory grow was
+    /// refused by the admission gate (the worker traps out-of-memory and is
+    /// restarted to reacquire memory).
+    pub fn record_worker_memory_grow_rejected() {
+        WORKER_MEMORY_GROW_REJECTED_TOTAL
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .inc();
     }
 
     /// Sets the gate's usable memory ceiling gauge.
@@ -269,6 +284,9 @@ pub mod workers {
         WORKER_WAITING_FOR_MEMORY_COUNT
             .with_label_values(&[id])
             .set(0.0);
+        WORKER_MEMORY_GROW_REJECTED_TOTAL
+            .with_label_values(&[id])
+            .inc_by(0.0);
     }
 
     pub fn inc_worker_memory_resident() {
diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs
index 6a26b3dd25..6940b53db4 100644
--- a/golem-worker-executor/src/services/active_workers/memory_probe.rs
+++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs
@@ -100,6 +100,38 @@ impl MemoryProbe for ProcessRssProbe {
     }
 }
 
+/// A probe with a fixed limit and a fixed current usage, both set at
+/// construction. Reports the same snapshot on every call regardless of the
+/// host. Used by the in-process test harness, where the executor shares its
+/// process (and therefore its real RSS) with the test framework and other
+/// services, so a process-RSS probe cannot isolate this executor's footprint.
+/// Pinning `current_bytes` to a known value (typically 0) makes the gate decide
+/// purely on the granted accounting against the pinned limit, which is exact and
+/// process-isolated, so memory-pressure tests are deterministic.
+#[derive(Debug)]
+pub struct FixedProbe {
+    limit_bytes: u64,
+    current_bytes: u64,
+}
+
+impl FixedProbe {
+    pub fn new(limit_bytes: u64, current_bytes: u64) -> Self {
+        Self {
+            limit_bytes,
+            current_bytes,
+        }
+    }
+}
+
+impl MemoryProbe for FixedProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.limit_bytes,
+            current_bytes: self.current_bytes,
+        }
+    }
+}
+
 /// Linux cgroup v2 probe. Reads `memory.max` and `memory.current` from the
 /// process's cgroup.
 #[cfg(target_os = "linux")]
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index b428674a07..4ac70f8744 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -33,7 +33,7 @@ use admission::{AdmissionController, AdmissionDecision, EvictionPriority, Evicti
 use async_trait::async_trait;
 pub use component_charge::HeldComponentCharge;
 use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry};
-use memory_probe::default_probe;
+use memory_probe::{MemoryProbe, default_probe};
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -102,6 +102,18 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         // bases its decision on the pod's cgroup limit when constrained (not host
         // RAM).
         let probe = default_probe(memory_config.system_memory_override);
+        Self::new_with_probe(probe, memory_config, storage_config)
+    }
+
+    /// Like [`Self::new`] but with an explicitly provided memory probe instead of
+    /// the one derived from the config. The in-process test harness uses this to
+    /// supply a probe with a pinned limit and current usage, so the gate's
+    /// decision is deterministic and isolated from the shared test process's RSS.
+    pub fn new_with_probe(
+        probe: Box<dyn MemoryProbe>,
+        memory_config: &MemoryConfig,
+        storage_config: &FilesystemStorageConfig,
+    ) -> Self {
         let admission = memory_config.enable_measured_admission.then(|| {
             Arc::new(AdmissionController::new(
                 probe,
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index bd1ead3243..69dd1c769f 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -1008,6 +1008,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             self.granted_memory.fetch_add(delta, Ordering::Relaxed);
             Ok(())
         } else {
+            crate::metrics::workers::record_worker_memory_grow_rejected();
             Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory))
         }
     }

From 7eb6f08fcaf80d02a1d1134d2c81b908d58a5f66 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Thu, 11 Jun 2026 01:12:43 -0700
Subject: [PATCH 43/60] fix: make admission gate reserve atomic to prevent
 ceiling overshoot

---
 .../services/active_workers/admission/mod.rs  | 37 ++++++--
 .../active_workers/admission/tests.rs         | 85 +++++++++++++++++++
 2 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs
index 0008f66773..e9ca7f7079 100644
--- a/golem-worker-executor/src/services/active_workers/admission/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs
@@ -134,14 +134,38 @@ impl AdmissionController {
     /// not cover), so taking the maximum keeps the gate safe against both the
     /// grant a worker may yet fault in and any usage the grant does not capture.
     fn admissible_headroom(&self) -> u64 {
+        let granted = *self.granted.lock().unwrap();
+        self.headroom_with_granted(granted)
+    }
+
+    /// Computes admissible headroom for an already-read `granted` value. Reads
+    /// the probe and emits the ceiling/RSS metrics. Kept separate from the lock
+    /// acquisition so the decision-and-reserve sequence can hold the lock across
+    /// both steps (see [`Self::try_reserve_locked`]).
+    fn headroom_with_granted(&self, granted: u64) -> u64 {
         let snapshot = self.probe.snapshot();
         let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64;
-        let granted = *self.granted.lock().unwrap();
         crate::metrics::workers::record_worker_memory_ceiling(ceiling);
         crate::metrics::workers::record_worker_admission_rss(snapshot.current_bytes);
         ceiling.saturating_sub(snapshot.current_bytes.max(granted))
     }
 
+    /// Atomically admits `request_bytes` if the headroom computed against the
+    /// current granted total covers it: reads `granted`, computes headroom, and
+    /// adds the reservation all under one lock so two concurrent admissions
+    /// cannot both pass the check against the same headroom and overshoot the
+    /// ceiling. Returns whether the request was admitted.
+    fn try_reserve_locked(&self, request_bytes: u64) -> bool {
+        let mut granted = self.granted.lock().unwrap();
+        if self.headroom_with_granted(*granted) >= request_bytes {
+            *granted += request_bytes;
+            crate::metrics::workers::record_worker_memory_granted(*granted);
+            true
+        } else {
+            false
+        }
+    }
+
     /// Record `request_bytes` of memory granted to a newly admitted worker. The
     /// gate reserves this until the worker unloads, because the worker may fault
     /// the granted pages in at any later time.
@@ -192,9 +216,8 @@ impl AdmissionController {
         request_bytes: u64,
         source: &dyn EvictionSource,
     ) -> AdmissionDecision {
-        // Fast path: enough real headroom already, admit without evicting.
-        if self.admissible_headroom() >= request_bytes {
-            self.reserve(request_bytes);
+        // Fast path: atomically admit if there is already enough real headroom.
+        if self.try_reserve_locked(request_bytes) {
             return AdmissionDecision::Admit;
         }
 
@@ -212,9 +235,9 @@ impl AdmissionController {
 
         // Re-measure against ground truth rather than trusting the freed tally:
         // the probe is the authority, and other activity may have moved usage
-        // in either direction while we were evicting.
-        if self.admissible_headroom() >= request_bytes {
-            self.reserve(request_bytes);
+        // in either direction while we were evicting. The check-and-reserve is
+        // atomic so a concurrent admission cannot slip in between.
+        if self.try_reserve_locked(request_bytes) {
             AdmissionDecision::Admit
         } else {
             AdmissionDecision::Reject
diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs
index 24e9b3e119..6f263930b3 100644
--- a/golem-worker-executor/src/services/active_workers/admission/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs
@@ -254,6 +254,91 @@ async fn apply_staggered_admit(
     decision
 }
 
+/// A probe with a fixed limit that always reports zero current usage, so the
+/// gate's admission decision is driven solely by the granted accounting against
+/// the ceiling. Used by the concurrency test, where the property under test is
+/// that the granted counter cannot be over-committed by racing admissions.
+#[derive(Debug)]
+struct ZeroUsageProbe {
+    limit: u64,
+}
+
+impl MemoryProbe for ZeroUsageProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.limit,
+            current_bytes: 0,
+        }
+    }
+}
+
+/// An eviction source with nothing to evict: a rejected request stays rejected.
+struct NoEvictionSource;
+
+#[async_trait::async_trait]
+impl EvictionSource for NoEvictionSource {
+    async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 {
+        0
+    }
+}
+
+/// Concurrent admissions must never grant more than the ceiling allows.
+///
+/// Many admit attempts of equal size race against a controller whose ceiling
+/// admits only a known number of them, with no evictable work to fall back on.
+/// Exactly `ceiling / request` requests must be admitted and the rest rejected;
+/// the total granted must never exceed the ceiling. This can only hold if each
+/// admission's "is there room? then reserve" sequence is atomic against the
+/// others — if two admits read the same headroom before either reserves, both
+/// pass and the granted total overshoots the ceiling.
+#[test]
+fn concurrent_admissions_never_overcommit_the_ceiling() {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(8)
+        .build()
+        .unwrap();
+
+    rt.block_on(async {
+        const REQUEST: u64 = 10;
+        const CAPACITY: u64 = 50; // exactly 5 requests fit
+        const ATTEMPTS: usize = 200; // far more than fit, all racing
+
+        let controller = Arc::new(AdmissionController::new(
+            Box::new(ZeroUsageProbe { limit: CAPACITY }),
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ));
+
+        let mut handles = Vec::with_capacity(ATTEMPTS);
+        for _ in 0..ATTEMPTS {
+            let controller = controller.clone();
+            handles.push(tokio::spawn(async move {
+                controller.try_admit(REQUEST, &NoEvictionSource).await
+            }));
+        }
+
+        let mut admitted = 0usize;
+        for handle in handles {
+            if handle.await.unwrap() == AdmissionDecision::Admit {
+                admitted += 1;
+            }
+        }
+
+        let expected = (CAPACITY / REQUEST) as usize;
+        assert_eq!(
+            admitted, expected,
+            "expected exactly {expected} admissions to fit, got {admitted}"
+        );
+        // With zero measured usage, headroom is the ceiling minus granted, and
+        // nothing is released in this test, so the granted total is exactly
+        // admitted * REQUEST. State the invariant directly: that total fits.
+        assert!(
+            admitted as u64 * REQUEST <= CAPACITY,
+            "granted {} exceeded ceiling {CAPACITY}",
+            admitted as u64 * REQUEST
+        );
+    });
+}
+
 // ── Single-case unit tests ───────────────────────────────────────────────────
 
 #[test]

From 83a6b2f071cfc518935b8e890a8b49f951e1ced6 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Thu, 11 Jun 2026 02:04:43 -0700
Subject: [PATCH 44/60] test: gate concurrent-agent permit tests with a
 semaphore, not Notify

---
 .../tests/resource_limits.rs                  | 35 ++++++++++++++-----
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/golem-worker-executor/tests/resource_limits.rs b/golem-worker-executor/tests/resource_limits.rs
index 58377cba3b..a816beb39e 100644
--- a/golem-worker-executor/tests/resource_limits.rs
+++ b/golem-worker-executor/tests/resource_limits.rs
@@ -186,11 +186,14 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish(
     let context = TestContext::new(last_unique_id);
     let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?;
 
-    // HTTP server that gates its /poll response behind a Notify.
+    // HTTP server that gates its /poll response behind a zero-permit semaphore.
     // HttpClient2.start_polling polls GET /poll until the body equals "done".
-    // By holding the Notify unreleased we keep a1 in the Running state
-    // for as long as needed, preventing eviction and holding the only permit.
-    let gate = std::sync::Arc::new(tokio::sync::Notify::new());
+    // The handler blocks acquiring a permit, so by withholding the permit we keep
+    // a1 in the Running state for as long as needed, preventing eviction and
+    // holding the only permit. A semaphore is used rather than a Notify so the
+    // release is not sensitive to whether the request's waiter is registered
+    // before the release call.
+    let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0));
     let gate_clone = gate.clone();
     let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?;
     let port = listener.local_addr()?.port();
@@ -200,7 +203,10 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish(
             get(move || {
                 let gate = gate_clone.clone();
                 async move {
-                    gate.notified().await;
+                    gate.acquire()
+                        .await
+                        .expect("gate semaphore closed")
+                        .forget();
                     "done".to_string()
                 }
             }),
@@ -259,7 +265,7 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish(
     // Release the gate — a1's poll loop returns "done", its invocation
     // completes, and its permit is returned to the semaphore via Drop.
     // This unblocks a2 from WaitingForPermit.
-    gate.notify_waiters();
+    gate.add_permits(1);
 
     // Wait for a1 to become Idle (invocation done, permit released).
     executor
@@ -320,7 +326,13 @@ async fn concurrent_agent_idle_releases_permit(
     let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?;
 
     // --- HTTP gate: keeps a1 provably Running until we release it. ---
-    let gate = std::sync::Arc::new(tokio::sync::Notify::new());
+    // A zero-permit semaphore is used rather than a Notify so the release is not
+    // sensitive to whether the request's waiter is registered before the release
+    // call: a permit added before the handler reaches `acquire` is simply waiting
+    // for it. The handler blocks on `acquire` and only returns once the test adds
+    // a permit, so a1 stays Running (blocked in /poll) until then regardless of
+    // how the runner schedules the tasks.
+    let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0));
     let gate_clone = gate.clone();
     let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?;
     let port = listener.local_addr()?.port();
@@ -330,7 +342,12 @@ async fn concurrent_agent_idle_releases_permit(
             get(move || {
                 let gate = gate_clone.clone();
                 async move {
-                    gate.notified().await;
+                    // Consume one permit permanently so a single added permit
+                    // releases exactly one poll, not a recycled one.
+                    gate.acquire()
+                        .await
+                        .expect("gate semaphore closed")
+                        .forget();
                     "done".to_string()
                 }
             }),
@@ -387,7 +404,7 @@ async fn concurrent_agent_idle_releases_permit(
     // Release the gate. a1's poll returns "done", invocation completes, a1 goes Idle.
     // With the fix: Idle transition drops the permit → semaphore notifies a2 → a2 starts.
     // With the bug: a1 stays Idle but holds permit → a2 remains blocked forever.
-    gate.notify_waiters();
+    gate.add_permits(1);
 
     // a2 should now be unblocked (fix) or remain stuck (bug).
     // Give it 15 seconds — well beyond what starting a counter agent takes.

From 24673f6f6975af7abe4dfc9d3970122f19e57e59 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Thu, 11 Jun 2026 15:53:07 -0700
Subject: [PATCH 45/60] feat: expose tokio metrics

---
 golem-worker-executor/src/lib.rs     |  4 ++
 golem-worker-executor/src/metrics.rs | 93 ++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs
index 8b05e6e71c..377123ddf7 100644
--- a/golem-worker-executor/src/lib.rs
+++ b/golem-worker-executor/src/lib.rs
@@ -1061,6 +1061,10 @@ pub async fn bootstrap_and_run_worker_executor<
 
     let leak_detector = worker_executor_impl.leak_detector();
 
+    join_set.spawn(crate::metrics::runtime::run_runtime_metrics_loop(
+        runtime.clone(),
+    ));
+
     let grpc_port = run_grpc_server(worker_executor_impl, lazy_worker_activator, join_set).await?;
 
     let http_port = golem_service_base::observability::start_health_and_metrics_server(
diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index 0e009c0705..237c4cb0f4 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -113,6 +113,99 @@ pub mod component {
     }
 }
 
+pub mod runtime {
+    use std::time::{Duration, Instant};
+
+    use lazy_static::lazy_static;
+    use prometheus::*;
+    use tokio::runtime::Handle;
+
+    lazy_static! {
+        /// Number of tasks currently sitting in the tokio runtime's global
+        /// (injection) queue: runnable but not yet polled by any worker thread.
+        /// A persistently non-zero value means ready tasks (including I/O
+        /// continuations such as DB-response handling) are waiting for a worker
+        /// thread, which inflates I/O latency metrics even when the underlying
+        /// I/O is fast.
+        static ref GLOBAL_QUEUE_DEPTH: IntGauge = register_int_gauge!(
+            "executor_tokio_global_queue_depth",
+            "Tasks scheduled in the tokio runtime global queue, runnable but not yet polled"
+        )
+        .unwrap();
+
+        /// Number of worker threads in the multi-thread runtime.
+        static ref NUM_WORKERS: IntGauge = register_int_gauge!(
+            "executor_tokio_num_workers",
+            "Number of tokio runtime worker threads"
+        )
+        .unwrap();
+
+        /// Current number of alive tasks in the runtime.
+        static ref NUM_ALIVE_TASKS: IntGauge = register_int_gauge!(
+            "executor_tokio_num_alive_tasks",
+            "Number of alive tasks in the tokio runtime"
+        )
+        .unwrap();
+
+        /// Per-worker busy ratio over the last sampling interval: the fraction
+        /// of wall-clock time the worker spent executing tasks. A value near 1.0
+        /// means the worker is CPU-saturated and cannot promptly poll newly
+        /// ready tasks.
+        static ref WORKER_BUSY_RATIO: GaugeVec = register_gauge_vec!(
+            "executor_tokio_worker_busy_ratio",
+            "Fraction of wall-clock time each tokio worker spent busy over the sampling interval",
+            &["worker"]
+        )
+        .unwrap();
+    }
+
+    /// Background loop that samples stable tokio runtime metrics and exports them
+    /// to Prometheus.
+    ///
+    /// All metrics used here are stable as of tokio 1.45 (the workspace resolves
+    /// 1.50+), so no `tokio_unstable` cfg or build flag is needed.
+    /// `global_queue_depth` is the primary diagnostic for runtime scheduling pressure;
+    /// `worker_busy_ratio` corroborates it by showing per-worker CPU saturation.
+    /// The first tick fires immediately, so the first ratio sample is noisy. Never returns.
+    pub async fn run_runtime_metrics_loop(handle: Handle) -> anyhow::Result<()> {
+        const INTERVAL: Duration = Duration::from_secs(5);
+
+        let metrics = handle.metrics();
+        let num_workers = metrics.num_workers();
+        NUM_WORKERS.set(num_workers as i64);
+
+        // Previous cumulative busy duration per worker, for computing the busy
+        // ratio over each interval.
+        let mut prev_busy: Vec<Duration> = (0..num_workers)
+            .map(|w| metrics.worker_total_busy_duration(w))
+            .collect();
+        let mut prev_instant = Instant::now();
+
+        let mut interval = tokio::time::interval(INTERVAL);
+        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+        loop {
+            interval.tick().await;
+
+            GLOBAL_QUEUE_DEPTH.set(metrics.global_queue_depth() as i64);
+            NUM_ALIVE_TASKS.set(metrics.num_alive_tasks() as i64);
+
+            let now = Instant::now();
+            let elapsed = now.duration_since(prev_instant).as_secs_f64();
+            prev_instant = now;
+            if elapsed > 0.0 {
+                for (w, prev) in prev_busy.iter_mut().enumerate() {
+                    let busy = metrics.worker_total_busy_duration(w);
+                    let delta = busy.saturating_sub(*prev).as_secs_f64();
+                    *prev = busy;
+                    WORKER_BUSY_RATIO
+                        .with_label_values(&[&w.to_string()])
+                        .set((delta / elapsed).min(1.0));
+                }
+            }
+        }
+    }
+}
+
 pub mod events {
     use lazy_static::lazy_static;
     use prometheus::*;

From a1928c56311def1098c72101c54706999b2894c8 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Thu, 11 Jun 2026 20:40:24 -0700
Subject: [PATCH 46/60] fix: prevent concurrent-agent scheduler deadlock on
 cancel-after-grant

---
 .../services/active_workers/tests.txt         |   7 +
 .../concurrent_agents_scheduler.rs            | 148 ++++++++----
 .../src/services/active_workers/tests.rs      | 220 ++++++++++++++++++
 3 files changed, 330 insertions(+), 45 deletions(-)
 create mode 100644 golem-worker-executor/proptest-regressions/services/active_workers/tests.txt

diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt
new file mode 100644
index 0000000000..5845bf0e72
--- /dev/null
+++ b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt
@@ -0,0 +1,7 @@
+# Seeds for failure cases proptest has generated in the past. It is
+# automatically read and these particular cases re-run before any
+# novel cases are generated.
+#
+# It is recommended to check this file in to source control so that
+# everyone who runs the test benefits from these saved cases.
+cc 25407766c98e9d718173e44b5321f97049eea6d6d7737aad80a937d7230d67d9 # shrinks to limit = 1, ops = [Acquire, Acquire, CancelPending(Index(423873604949)), Acquire, ReleaseThenCancel(Index(2899867607303593255), Index(13233034632676646474))]
diff --git a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
index 77c3f74b86..2391fce697 100644
--- a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
+++ b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
@@ -48,42 +48,95 @@ struct AccountSchedulerState {
 
 struct QueuedAgent {
     agent_id: AgentId,
-    waker: tokio::sync::oneshot::Sender<OwnedSemaphorePermit>,
+    waker: tokio::sync::oneshot::Sender<GrantedSlot>,
 }
 
-/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`].
+/// A slot granted from the scheduler: owns the underlying semaphore permit and
+/// the responsibility to decrement the account's `running_count` and wake the
+/// next queued agent when it is released.
 ///
-/// On drop, decrements the account's running count and wakes the next queued
-/// agent (if any). The drop handler is fully synchronous.
-pub struct ConcurrentAgentPermit {
+/// Crucially, the `running_count` was incremented *together with* acquiring the
+/// raw permit, and the matching decrement lives **only** here in `Drop`. This
+/// binds the count strictly to the lifetime of the granted permit, regardless
+/// of how the slot is ultimately disposed of:
+///
+/// * It is moved into a [`ConcurrentAgentPermit`] and dropped when the agent
+///   releases the slot (the normal case), or
+/// * it is sent into a queued waiter's oneshot and that waiter is cancelled
+///   before receiving it — the slot is then dropped *inside* the channel.
+///
+/// Both paths run this same `Drop`, so a slot granted to a waiter that is
+/// cancelled after the grant succeeded cannot leak the count. (A previous
+/// design decremented only when the oneshot `send` failed, which left
+/// `running_count` permanently inflated when a waiter was cancelled *after* a
+/// successful send — wedging the whole account once the count reached the
+/// limit.)
+struct GrantedSlot {
     raw: Option<OwnedSemaphorePermit>,
-    account: Option<Arc<AccountScheduler>>,
+    account: Arc<AccountScheduler>,
     account_id: AccountId,
 }
 
-impl Drop for ConcurrentAgentPermit {
+impl Drop for GrantedSlot {
     fn drop(&mut self) {
         if let Some(raw) = self.raw.take() {
             // Return the raw permit to the semaphore first so it is available
             // for the next queued agent's synchronous try-acquire.
             drop(raw);
-
-            if let Some(ref account) = self.account {
-                try_grant_next_sync(account, &self.account_id);
-            }
+            try_grant_next_sync(&self.account, &self.account_id);
         }
     }
 }
 
-impl ConcurrentAgentPermit {
-    /// Consumes the permit without triggering the drop notification.
-    #[allow(dead_code)]
-    pub fn into_inner(mut self) -> Option<OwnedSemaphorePermit> {
-        self.account = None;
+impl GrantedSlot {
+    /// Take the raw permit out, suppressing this slot's `Drop` bookkeeping.
+    ///
+    /// Used only from `drain_ready_queue` when a `send` to a cancelled waiter
+    /// fails: the slot is returned in the `Err`, but we are still holding the
+    /// account state lock, so letting its `Drop` run would re-enter
+    /// `try_grant_next_sync` and deadlock on the same non-reentrant mutex. The
+    /// caller takes the permit back and performs the accounting inline instead.
+    fn defuse(mut self) -> Option<OwnedSemaphorePermit> {
         self.raw.take()
     }
 }
 
+/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`].
+///
+/// On drop, decrements the account's running count and wakes the next queued
+/// agent (if any) via the held [`GrantedSlot`]. Unlimited accounts hold a bare
+/// permit with no slot, so dropping them touches no scheduler accounting. The
+/// drop handler is fully synchronous.
+pub struct ConcurrentAgentPermit {
+    /// `Some` for limited accounts (carries the scheduler accounting); `None`
+    /// for unlimited accounts, where `_raw` holds the bare bypass permit. Held
+    /// purely for its `Drop`, which returns the permit and wakes the next
+    /// queued agent.
+    _slot: Option<GrantedSlot>,
+    /// Bare permit for the unlimited-account bypass path. Unused for limited
+    /// accounts (the permit lives inside `_slot`).
+    _raw: Option<OwnedSemaphorePermit>,
+}
+
+impl ConcurrentAgentPermit {
+    /// A permit for a limited account, carrying the scheduler accounting.
+    fn from_slot(slot: GrantedSlot) -> Self {
+        Self {
+            _slot: Some(slot),
+            _raw: None,
+        }
+    }
+
+    /// A permit for an unlimited account: a bare bypass permit with no
+    /// scheduler accounting.
+    fn unlimited(raw: OwnedSemaphorePermit) -> Self {
+        Self {
+            _slot: None,
+            _raw: Some(raw),
+        }
+    }
+}
+
 impl Default for ConcurrentAgentsScheduler {
     fn default() -> Self {
         Self::new()
@@ -156,11 +209,7 @@ impl ConcurrentAgentsScheduler {
         // Unlimited accounts bypass the queue entirely.
         if is_unlimited(limit) {
             let raw = self.permits.acquire(account_id, || async { false }).await;
-            return ConcurrentAgentPermit {
-                raw: Some(raw),
-                account: None,
-                account_id,
-            };
+            return ConcurrentAgentPermit::unlimited(raw);
         }
 
         // Sync the underlying semaphore pool size with the current plan limit
@@ -175,16 +224,12 @@ impl ConcurrentAgentsScheduler {
         let limit = account.resource_entry.max_concurrent_agents_per_executor();
         if is_unlimited(limit) {
             let raw = self.permits.acquire(account_id, || async { false }).await;
-            return ConcurrentAgentPermit {
-                raw: Some(raw),
-                account: None,
-                account_id,
-            };
+            return ConcurrentAgentPermit::unlimited(raw);
         }
 
         enum AcquireDecision {
             FastPath(OwnedSemaphorePermit),
-            Queued(tokio::sync::oneshot::Receiver<OwnedSemaphorePermit>),
+            Queued(tokio::sync::oneshot::Receiver<GrantedSlot>),
         }
 
         let decision = {
@@ -197,7 +242,7 @@ impl ConcurrentAgentsScheduler {
             // After a plan upgrade, newly added semaphore permits may allow
             // queued agents to proceed. Drain what we can before deciding
             // about the current agent.
-            drain_ready_queue(&mut state, &account.raw_semaphore, limit, &account_id);
+            drain_ready_queue(&mut state, &account, limit, &account_id);
 
             // Fast path: capacity available, no older waiters, and the raw
             // semaphore actually has a permit. We try-acquire the semaphore
@@ -239,26 +284,22 @@ impl ConcurrentAgentsScheduler {
                     "ConcurrentAgentsScheduler: fast-path permit for {agent_id} in account {account_id}"
                 );
 
-                ConcurrentAgentPermit {
+                ConcurrentAgentPermit::from_slot(GrantedSlot {
                     raw: Some(raw),
-                    account: Some(account),
+                    account,
                     account_id,
-                }
+                })
             }
             AcquireDecision::Queued(rx) => {
                 debug!(
                     "ConcurrentAgentsScheduler: {agent_id} queued in account {account_id}, waiting for permit"
                 );
 
-                let raw = rx.await.expect(
+                let slot = rx.await.expect(
                     "ConcurrentAgentsScheduler: oneshot sender dropped without sending — scheduler bug",
                 );
 
-                ConcurrentAgentPermit {
-                    raw: Some(raw),
-                    account: Some(account),
-                    account_id,
-                }
+                ConcurrentAgentPermit::from_slot(slot)
             }
         }
     }
@@ -299,7 +340,7 @@ impl ConcurrentAgentsScheduler {
 /// be fully synchronous. Uses `tokio::sync::Semaphore::try_acquire_owned`
 /// (which is synchronous despite being on a tokio type) to acquire permits
 /// for queued agents.
-fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) {
+fn try_grant_next_sync(account: &Arc<AccountScheduler>, account_id: &AccountId) {
     let limit = account.resource_entry.max_concurrent_agents_per_executor();
     if is_unlimited(limit) {
         return;
@@ -308,7 +349,7 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) {
     let mut state = account.state.lock().unwrap();
     state.running_count = state.running_count.saturating_sub(1);
 
-    drain_ready_queue(&mut state, &account.raw_semaphore, limit, account_id);
+    drain_ready_queue(&mut state, account, limit, account_id);
 }
 
 /// Try to grant permits to queued agents from the front of the ready queue.
@@ -316,9 +357,15 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) {
 /// Called both from `try_grant_next_sync` (Drop path) and from `acquire`
 /// (after a plan-upgrade sync adds new permits). Fully synchronous — only
 /// uses `try_acquire_owned` which does not block.
+///
+/// Each granted permit is wrapped in a [`GrantedSlot`] carrying the
+/// `running_count` decrement, so a waiter cancelled *after* a successful send
+/// still releases its slot (via the slot's `Drop` when the oneshot channel is
+/// dropped) rather than leaking the count. The increment here is matched
+/// one-for-one by that slot's `Drop`.
 fn drain_ready_queue(
     state: &mut AccountSchedulerState,
-    raw_semaphore: &Arc<tokio::sync::Semaphore>,
+    account: &Arc<AccountScheduler>,
     limit: u64,
     account_id: &AccountId,
 ) {
@@ -326,13 +373,24 @@ fn drain_ready_queue(
         let queued = state.ready_queue.pop_front().unwrap();
 
         // tokio::sync::Semaphore::try_acquire_owned is synchronous.
-        match raw_semaphore.clone().try_acquire_owned() {
+        match account.raw_semaphore.clone().try_acquire_owned() {
             Ok(raw) => {
                 state.running_count += 1;
-                if queued.waker.send(raw).is_err() {
-                    // Waiter was cancelled; the permit inside the oneshot
-                    // is dropped, returning it to the semaphore. Decrement
-                    // and try next.
+                let slot = GrantedSlot {
+                    raw: Some(raw),
+                    account: account.clone(),
+                    account_id: *account_id,
+                };
+                if let Err(slot) = queued.waker.send(slot) {
+                    // Waiter was cancelled before we could hand it the slot.
+                    // We are still holding the state lock, so we must not let
+                    // the returned slot's `Drop` run (it would re-enter this
+                    // path via `try_grant_next_sync` and deadlock). Defuse it,
+                    // return its permit to the semaphore, and account for it
+                    // inline, then try the next queued agent.
+                    if let Some(raw) = slot.defuse() {
+                        drop(raw);
+                    }
                     state.running_count -= 1;
                     debug!(
                         "ConcurrentAgentsScheduler: waiter {} cancelled in account {account_id}, trying next",
diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs
index 53481b4d18..1f6c8313cf 100644
--- a/golem-worker-executor/src/services/active_workers/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/tests.rs
@@ -840,3 +840,223 @@ mod component_module_charge {
         );
     }
 }
+
+// ── ConcurrentAgentsScheduler — model-based liveness property ────────────────
+//
+// The scheduler keeps its own `running_count` integer alongside the real tokio
+// semaphore permits. The two must stay in lockstep: every increment of
+// `running_count` must be matched by exactly one decrement, regardless of how a
+// granted slot is disposed of (released by a live worker, or dropped inside a
+// cancelled waiter's oneshot channel). If they drift, the scheduler wedges —
+// `running_count` sticks at the limit while permits are actually free, and
+// every future acquire queues forever. This is the production deadlock the
+// property is designed to catch.
+//
+// The model drives random interleavings of acquire / release / cancel against
+// the real scheduler and, after every step, asserts the *liveness* invariant:
+// whenever fewer permits are genuinely held than the limit allows, a fresh
+// acquire must succeed promptly. A leaked `running_count` violates this.
+mod scheduler_liveness {
+    use super::super::concurrent_agents_scheduler::{
+        ConcurrentAgentPermit, ConcurrentAgentsScheduler,
+    };
+    use super::{account, agent, resource_entry_with_agent_limit};
+    use proptest::prelude::*;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_r::test;
+    use tokio::task::JoinHandle;
+
+    /// One step in a randomized scheduler workload.
+    #[derive(Debug, Clone)]
+    enum Op {
+        /// Spawn an acquire task into `pending`; after it resolves (at once if
+        /// capacity is free, otherwise once granted) it is moved to `held`.
+        Acquire,
+        /// Release a currently-held permit, if any (the index selects which).
+        Release(prop::sample::Index),
+        /// Cancel an in-flight (likely queued) acquire, if any. Exercises both
+        /// "cancelled while queued" and "cancelled just after being granted".
+        CancelPending(prop::sample::Index),
+        /// Release a held permit and, in the same step, cancel an in-flight
+        /// acquire. This is the deadly race: the released slot may be granted
+        /// to the in-flight acquire's oneshot and then the acquire is cancelled
+        /// before it can receive it. The slot must still be released.
+        ReleaseThenCancel(prop::sample::Index, prop::sample::Index),
+    }
+
+    fn arb_ops() -> impl Strategy<Value = Vec<Op>> {
+        prop::collection::vec(
+            prop_oneof![
+                3 => Just(Op::Acquire), // leading numbers are relative weights
+                2 => any::<prop::sample::Index>().prop_map(Op::Release),
+                2 => any::<prop::sample::Index>().prop_map(Op::CancelPending),
+                3 => (any::<prop::sample::Index>(), any::<prop::sample::Index>())
+                    .prop_map(|(a, b)| Op::ReleaseThenCancel(a, b)),
+            ],
+            1..60, // workload length: 1 to 59 ops (half-open range)
+        )
+    }
+
+    /// Let grant/drain bookkeeping triggered by a release or cancellation settle
+    /// before the next observation: 8 cooperative yields, then a 1 ms sleep.
+    async fn settle() {
+        for _ in 0..8 {
+            tokio::task::yield_now().await;
+        }
+        tokio::time::sleep(Duration::from_millis(1)).await;
+    }
+
+    proptest! {
+        // Cap shrink iterations so a failing (buggy) run cannot spend minutes
+        // re-running wedging inputs against the overall timeout while shrinking.
+        #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })]
+
+        /// Liveness: under any interleaving of acquire / release / cancel, the
+        /// scheduler never wedges. After each step, if fewer permits are held
+        /// than the limit, a fresh acquire must succeed within a short timeout.
+        /// At the end, draining all held permits must let the account return to
+        /// full capacity.
+        #[test]
+        fn scheduler_never_wedges_under_churn(
+            limit in 1usize..6, // limits 1 through 5
+            ops in arb_ops(),
+        ) {
+            let rt = tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(2)
+                .enable_time() // needed by the `sleep` / `timeout` calls in this test
+                .build()
+                .unwrap();
+
+            rt.block_on(async move {
+                // Bound the whole case so a wedge fails fast and deterministically
+                // rather than hanging the test suite. A correct scheduler completes
+                // a workload of up to 59 ops in well under a second; the bug deadlocks
+                // here, so a tight bound makes the failure (and any shrinking) quick.
+                let outcome = tokio::time::timeout(Duration::from_secs(3), async move {
+                    run_workload(limit, ops).await
+                })
+                .await;
+
+                match outcome {
+                    Ok(result) => result,
+                    Err(_elapsed) => Err(TestCaseError::fail(
+                        "scheduler workload did not complete within the overall timeout — \
+                         deadlock (running_count leaked above true occupancy)",
+                    )),
+                }
+            })?;
+        }
+    }
+
+    /// Drives one randomized workload against a freshly-registered account and
+    /// returns `Err` if the liveness invariant is ever violated. Factored out of
+    /// the proptest body so the whole run can be wrapped in an overall timeout.
+    async fn run_workload(limit: usize, ops: Vec<Op>) -> Result<(), TestCaseError> {
+        // Short per-acquire timeout (well below the caller's 3 s whole-case bound):
+        // a wedge must surface quickly, yet multi-thread scheduling jitter needs slack.
+        const PROBE_TIMEOUT: Duration = Duration::from_millis(500);
+
+        let sched = Arc::new(ConcurrentAgentsScheduler::new());
+        let acc = account();
+        sched
+            .register_account(acc, resource_entry_with_agent_limit(limit as u64))
+            .await;
+
+        // Permits we are deliberately holding (count against the limit).
+        let mut held: Vec<ConcurrentAgentPermit> = Vec::new();
+        // In-flight acquires not yet resolved (queued or just granted).
+        let mut pending: Vec<JoinHandle<ConcurrentAgentPermit>> = Vec::new();
+        let mut counter = 0usize; // only used to give each spawned agent a unique name
+
+        for op in ops {
+            match op {
+                Op::Acquire => {
+                    counter += 1;
+                    let sched = sched.clone();
+                    let name = format!("W{counter}");
+                    let handle =
+                        tokio::spawn(async move { sched.acquire(acc, agent(&name)).await });
+                    pending.push(handle);
+                }
+                Op::Release(idx) => {
+                    if !held.is_empty() {
+                        let i = idx.index(held.len());
+                        drop(held.remove(i));
+                    }
+                }
+                Op::CancelPending(idx) => {
+                    if !pending.is_empty() {
+                        let i = idx.index(pending.len());
+                        pending.remove(i).abort(); // cancels the in-flight acquire task
+                    }
+                }
+                Op::ReleaseThenCancel(ri, ci) => {
+                    if !held.is_empty() {
+                        let i = ri.index(held.len());
+                        drop(held.remove(i));
+                    }
+                    if !pending.is_empty() {
+                        let i = ci.index(pending.len());
+                        pending.remove(i).abort(); // no `settle()` between release and cancel
+                    }
+                }
+            }
+
+            settle().await;
+
+            // Collect any in-flight acquires that have now resolved into
+            // held permits, so `held.len()` reflects true occupancy.
+            let mut still_pending = Vec::new();
+            for h in pending.drain(..) {
+                if h.is_finished() {
+                    if let Ok(permit) = h.await {
+                        held.push(permit);
+                    }
+                    // Cancelled/aborted handles are simply dropped.
+                } else {
+                    still_pending.push(h);
+                }
+            }
+            pending = still_pending;
+
+            // Liveness invariant: if we are below the limit, a fresh
+            // acquire must succeed promptly. A leaked running_count
+            // would make this hang and trip the timeout.
+            if held.len() < limit {
+                let probe =
+                    tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("probe"))).await;
+                prop_assert!(
+                    probe.is_ok(),
+                    "scheduler wedged: held {} < limit {} but acquire timed out",
+                    held.len(),
+                    limit,
+                );
+                // Release the probe immediately.
+                drop(probe.ok());
+                settle().await;
+            }
+        }
+
+        // Abort everything still queued (awaiting each handle, which also drops
+        // the permit of any task that had already finished), drop all held
+        // permits, and confirm that `limit` fresh acquires all succeed.
+        for h in pending.drain(..) {
+            h.abort();
+            let _ = h.await;
+        }
+        held.clear();
+        settle().await;
+
+        let mut drained = Vec::new();
+        for _ in 0..limit {
+            let p = tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("drain"))).await;
+            prop_assert!(
+                p.is_ok(),
+                "scheduler did not return to full capacity after churn",
+            );
+            drained.push(p.unwrap()); // keep it held so all `limit` permits coexist
+        }
+        Ok(())
+    }
+}

From 183de28520b2fa46a0056558136f80ece917d5ca Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:45:14 -0700
Subject: [PATCH 47/60] feat: use official tokio-metrics crate to expose tokio
 runtime metrics

---
 golem-debugging-service/src/config.rs         |   1 +
 golem-service-base/src/observability.rs       |  29 +++-
 golem-worker-executor/Cargo.toml              |   3 +
 .../config/worker-executor.sample.env         |   3 +
 .../config/worker-executor.toml               |   3 +
 golem-worker-executor/src/identity.rs         |  32 ++++
 golem-worker-executor/src/lib.rs              |  10 +-
 golem-worker-executor/src/metrics.rs          | 164 ++++++++----------
 .../src/services/golem_config.rs              |   6 +
 9 files changed, 153 insertions(+), 98 deletions(-)
 create mode 100644 golem-worker-executor/src/identity.rs

diff --git a/golem-debugging-service/src/config.rs b/golem-debugging-service/src/config.rs
index dc6299652b..6a9869550b 100644
--- a/golem-debugging-service/src/config.rs
+++ b/golem-debugging-service/src/config.rs
@@ -98,6 +98,7 @@ impl DebugConfig {
             max_in_function_retry_delay: std::time::Duration::from_secs(20),
             max_websocket_connections: 100,
             quota_service: QuotaServiceConfig::default(),
+            runtime_metrics_sampling_interval: std::time::Duration::from_secs(5),
         }
     }
 }
diff --git a/golem-service-base/src/observability.rs b/golem-service-base/src/observability.rs
index 98a83dd36e..f9f54b554c 100644
--- a/golem-service-base/src/observability.rs
+++ b/golem-service-base/src/observability.rs
@@ -18,21 +18,42 @@ use axum::response::IntoResponse;
 use axum::routing::get;
 use http::Response;
 use prometheus::{Encoder, Registry, TextEncoder};
+use std::sync::Arc;
 use tokio::net::{TcpListener, ToSocketAddrs};
 use tokio::task::JoinSet;
 use tracing::{Instrument, info};
 
+/// A callback that renders additional metrics in Prometheus text exposition
+/// format. Its output is appended as-is after the `prometheus`-crate registry
+/// output on the `/metrics` endpoint. Used to surface metrics from a second
+/// metrics façade (e.g. the `metrics`-crate recorder driving tokio-metrics) on
+/// the same scrape endpoint.
+pub type ExtraMetrics = Arc<dyn Fn() -> String + Send + Sync>;
+
 pub async fn start_health_and_metrics_server(
     addr: impl ToSocketAddrs,
     registry: Registry,
     body_message: &'static str,
     join_set: &mut JoinSet<Result<(), anyhow::Error>>,
+) -> Result<u16, anyhow::Error> {
+    start_health_and_metrics_server_with_extra(addr, registry, None, body_message, join_set).await
+}
+
+pub async fn start_health_and_metrics_server_with_extra(
+    addr: impl ToSocketAddrs,
+    registry: Registry,
+    extra: Option<ExtraMetrics>,
+    body_message: &'static str,
+    join_set: &mut JoinSet<Result<(), anyhow::Error>>,
 ) -> Result<u16, anyhow::Error> {
     let app = Router::new()
         .route("/healthcheck", get(move || async move { body_message }))
         .route(
             "/metrics",
-            get(|| async move { prometheus_metrics(registry.clone()) }),
+            get(move || {
+                let extra = extra.clone();
+                async move { prometheus_metrics(registry.clone(), extra) }
+            }),
         );
 
     let listener = TcpListener::bind(addr).await?;
@@ -51,13 +72,17 @@ pub async fn start_health_and_metrics_server(
     Ok(local_addr.port())
 }
 
-pub fn prometheus_metrics(registry: Registry) -> impl IntoResponse {
+pub fn prometheus_metrics(registry: Registry, extra: Option<ExtraMetrics>) -> impl IntoResponse {
     let encoder = TextEncoder::new();
     let mut buffer = Vec::new();
 
     let metric_families = registry.gather();
     encoder.encode(&metric_families, &mut buffer).unwrap();
 
+    if let Some(extra) = extra {
+        buffer.extend_from_slice(extra().as_bytes());
+    }
+
     Response::builder()
         .header("Content-Type", encoder.format_type())
         .body(Body::from(buffer))
diff --git a/golem-worker-executor/Cargo.toml b/golem-worker-executor/Cargo.toml
index bfe2dcafe8..2aa2f37532 100644
--- a/golem-worker-executor/Cargo.toml
+++ b/golem-worker-executor/Cargo.toml
@@ -73,6 +73,8 @@ lazy_static = { workspace = true }
 log = { workspace = true }
 mac_address = { workspace = true, features = ["serde"] }
 md5 = { workspace = true }
+metrics = { workspace = true }
+metrics-exporter-prometheus = { workspace = true }
 metrohash = { workspace = true }
 nonempty-collections = { workspace = true }
 nonzero_ext = { workspace = true }
@@ -92,6 +94,7 @@ sqlx-core = { workspace = true }
 sysinfo = { workspace = true }
 tempfile = { workspace = true }
 tokio = { workspace = true }
+tokio-metrics = { workspace = true }
 tokio-stream = { workspace = true }
 tokio-tungstenite = { workspace = true }
 tokio-util = { workspace = true }
diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env
index d3c7a04559..35725ab38f 100644
--- a/golem-worker-executor/config/worker-executor.sample.env
+++ b/golem-worker-executor/config/worker-executor.sample.env
@@ -4,6 +4,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0"
 GOLEM__HTTP_PORT=8082
 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s"
 GOLEM__MAX_WEBSOCKET_CONNECTIONS=100
+GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s"
 GOLEM__TRACING_FILE_NAME_WITH_PORT=true
 GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25
 GOLEM__ACTIVE_WORKERS__TTL="8h"
@@ -215,6 +216,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0"
 GOLEM__HTTP_PORT=8082
 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s"
 GOLEM__MAX_WEBSOCKET_CONNECTIONS=100
+GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s"
 GOLEM__TRACING_FILE_NAME_WITH_PORT=true
 GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25
 GOLEM__ACTIVE_WORKERS__TTL="8h"
@@ -436,6 +438,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0"
 GOLEM__HTTP_PORT=8082
 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s"
 GOLEM__MAX_WEBSOCKET_CONNECTIONS=100
+GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s"
 GOLEM__TRACING_FILE_NAME_WITH_PORT=true
 GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25
 GOLEM__ACTIVE_WORKERS__TTL="8h"
diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml
index e77c5f9bfa..df58d45b50 100644
--- a/golem-worker-executor/config/worker-executor.toml
+++ b/golem-worker-executor/config/worker-executor.toml
@@ -3,6 +3,7 @@ http_address = "0.0.0.0"
 http_port = 8082
 max_in_function_retry_delay = "20s"
 max_websocket_connections = 100
+runtime_metrics_sampling_interval = "5s"
 tracing_file_name_with_port = true
 
 [active_workers]
@@ -333,6 +334,7 @@ without_time = false
 # http_port = 8082
 # max_in_function_retry_delay = "20s"
 # max_websocket_connections = 100
+# runtime_metrics_sampling_interval = "5s"
 # tracing_file_name_with_port = true
 # 
 # [active_workers]
@@ -665,6 +667,7 @@ without_time = false
 # http_port = 8082
 # max_in_function_retry_delay = "20s"
 # max_websocket_connections = 100
+# runtime_metrics_sampling_interval = "5s"
 # tracing_file_name_with_port = true
 # 
 # [active_workers]
diff --git a/golem-worker-executor/src/identity.rs b/golem-worker-executor/src/identity.rs
new file mode 100644
index 0000000000..e2f95b0cae
--- /dev/null
+++ b/golem-worker-executor/src/identity.rs
@@ -0,0 +1,32 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Process/pod identity for this worker-executor instance.
+//!
+//! The identity is derived from the `POD_NAME` env var, falling back to
+//! `HOSTNAME`, then `"unknown"`, resolved once and cached for the lifetime of
+//! the process. It is used both as the `executor_id` metric label and anywhere
+//! else the running instance needs to identify itself.
+
+/// Returns the stable identity of this worker-executor instance.
+///
+/// Resolved once on first call and cached for the lifetime of the process.
+pub fn executor_id() -> &'static str {
+    static EXECUTOR_ID: std::sync::OnceLock<String> = std::sync::OnceLock::new();
+    EXECUTOR_ID.get_or_init(|| {
+        std::env::var("POD_NAME") // `Err` when unset or not valid Unicode
+            .or_else(|_| std::env::var("HOSTNAME")) // same rule for the fallback
+            .unwrap_or_else(|_| "unknown".to_string()) // neither was readable
+    })
+}
diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs
index 377123ddf7..b9ecc4f640 100644
--- a/golem-worker-executor/src/lib.rs
+++ b/golem-worker-executor/src/lib.rs
@@ -16,6 +16,7 @@ pub mod bootstrap;
 pub mod config;
 pub mod durable_host;
 pub mod grpc;
+pub mod identity;
 pub mod metrics;
 pub mod model;
 pub mod preview2;
@@ -1061,15 +1062,18 @@ pub async fn bootstrap_and_run_worker_executor<
 
     let leak_detector = worker_executor_impl.leak_detector();
 
-    join_set.spawn(crate::metrics::runtime::run_runtime_metrics_loop(
+    let runtime_metrics = crate::metrics::runtime::install_runtime_metrics(
         runtime.clone(),
-    ));
+        golem_config.runtime_metrics_sampling_interval,
+        join_set,
+    );
 
     let grpc_port = run_grpc_server(worker_executor_impl, lazy_worker_activator, join_set).await?;
 
-    let http_port = golem_service_base::observability::start_health_and_metrics_server(
+    let http_port = golem_service_base::observability::start_health_and_metrics_server_with_extra(
         golem_config.http_addr()?,
         prometheus_registry,
+        runtime_metrics,
         "Worker executor is running",
         join_set,
     )
diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index 237c4cb0f4..980ae3842d 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -114,95 +114,79 @@ pub mod component {
 }
 
 pub mod runtime {
-    use std::time::{Duration, Instant};
+    use std::sync::Arc;
+    use std::time::Duration;
 
-    use lazy_static::lazy_static;
-    use prometheus::*;
+    use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
     use tokio::runtime::Handle;
+    use tokio::task::JoinSet;
+    use tokio_metrics::RuntimeMetricsReporterBuilder;
 
-    lazy_static! {
-        /// Number of tasks currently sitting in the tokio runtime's global
-        /// (injection) queue: runnable but not yet polled by any worker thread.
-        /// A persistently non-zero value means ready tasks (including I/O
-        /// continuations such as DB-response handling) are waiting for a worker
-        /// thread, which inflates I/O latency metrics even when the underlying
-        /// I/O is fast.
-        static ref GLOBAL_QUEUE_DEPTH: IntGauge = register_int_gauge!(
-            "executor_tokio_global_queue_depth",
-            "Tasks scheduled in the tokio runtime global queue, runnable but not yet polled"
-        )
-        .unwrap();
-
-        /// Number of worker threads in the multi-thread runtime.
-        static ref NUM_WORKERS: IntGauge = register_int_gauge!(
-            "executor_tokio_num_workers",
-            "Number of tokio runtime worker threads"
-        )
-        .unwrap();
-
-        /// Current number of alive tasks in the runtime.
-        static ref NUM_ALIVE_TASKS: IntGauge = register_int_gauge!(
-            "executor_tokio_num_alive_tasks",
-            "Number of alive tasks in the tokio runtime"
-        )
-        .unwrap();
+    /// How often `install_runtime_metrics`' upkeep task calls `run_upkeep` to keep
+    /// the recorder's storage bounded (e.g. pruning idle metrics if so configured).
+    const UPKEEP_INTERVAL: Duration = Duration::from_secs(30);
 
-        /// Per-worker busy ratio over the last sampling interval: the fraction
-        /// of wall-clock time the worker spent executing tasks. A value near 1.0
-        /// means the worker is CPU-saturated and cannot promptly poll newly
-        /// ready tasks.
-        static ref WORKER_BUSY_RATIO: GaugeVec = register_gauge_vec!(
-            "executor_tokio_worker_busy_ratio",
-            "Fraction of wall-clock time each tokio worker spent busy over the sampling interval",
-            &["worker"]
-        )
-        .unwrap();
-    }
-
-    /// Background loop that samples stable tokio runtime metrics and exports them
-    /// to Prometheus.
+    /// Installs a dedicated `metrics`-crate Prometheus recorder for tokio
+    /// runtime metrics, spawns the tokio-metrics reporter on `join_set`, and
+    /// returns a renderer that emits the collected metrics in Prometheus text
+    /// format.
     ///
-    /// All metrics used here are stable as of tokio 1.45 (the workspace resolves
-    /// 1.50+), so this requires neither the `tokio_unstable` cfg nor any build
-    /// flag. `global_queue_depth` is the primary diagnostic for runtime
-    /// scheduling pressure; `worker_busy_ratio` corroborates it by showing
-    /// per-worker CPU saturation. Never returns.
-    pub async fn run_runtime_metrics_loop(handle: Handle) -> anyhow::Result<()> {
-        const INTERVAL: Duration = Duration::from_secs(5);
-
-        let metrics = handle.metrics();
-        let num_workers = metrics.num_workers();
-        NUM_WORKERS.set(num_workers as i64);
-
-        // Previous cumulative busy duration per worker, for computing the busy
-        // ratio over each interval.
-        let mut prev_busy: Vec<Duration> = (0..num_workers)
-            .map(|w| metrics.worker_total_busy_duration(w))
-            .collect();
-        let mut prev_instant = Instant::now();
-
-        let mut interval = tokio::time::interval(INTERVAL);
-        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
-        loop {
-            interval.tick().await;
-
-            GLOBAL_QUEUE_DEPTH.set(metrics.global_queue_depth() as i64);
-            NUM_ALIVE_TASKS.set(metrics.num_alive_tasks() as i64);
-
-            let now = Instant::now();
-            let elapsed = now.duration_since(prev_instant).as_secs_f64();
-            prev_instant = now;
-            if elapsed > 0.0 {
-                for (w, prev) in prev_busy.iter_mut().enumerate() {
-                    let busy = metrics.worker_total_busy_duration(w);
-                    let delta = busy.saturating_sub(*prev).as_secs_f64();
-                    *prev = busy;
-                    WORKER_BUSY_RATIO
-                        .with_label_values(&[&w.to_string()])
-                        .set((delta / elapsed).min(1.0));
-                }
+    /// `sampling_interval` controls how often metrics are sampled from the
+    /// runtime into the recorder; Prometheus scrapes the rendered values
+    /// independently.
+    ///
+    /// The returned closure has the shape of `observability::ExtraMetrics` and
+    /// is appended to the `prometheus`-crate scrape output on the shared
+    /// `/metrics` endpoint, so all `tokio_*` series appear alongside the rest
+    /// of the executor's metrics, carrying the same `executor_id` label.
+    ///
+    /// Returns `None` if a global `metrics` recorder is already installed (which
+    /// should not happen in the executor), in which case runtime metrics are
+    /// simply not exported.
+    pub fn install_runtime_metrics(
+        runtime: Handle,
+        sampling_interval: Duration,
+        join_set: &mut JoinSet<anyhow::Result<()>>,
+    ) -> Option<Arc<dyn Fn() -> String + Send + Sync>> {
+        let executor_id = crate::identity::executor_id();
+
+        let handle: PrometheusHandle = match PrometheusBuilder::new()
+            .add_global_label("executor_id", executor_id)
+            .install_recorder()
+        {
+            Ok(handle) => handle,
+            Err(err) => {
+                tracing::warn!(
+                    "Failed to install tokio runtime metrics recorder, runtime metrics will not be exported: {err}"
+                );
+                return None;
             }
-        }
+        };
+
+        let reporter = RuntimeMetricsReporterBuilder::default().with_interval(sampling_interval);
+        join_set.spawn_on(
+            async move {
+                // NOTE(review): presumably this only returns when the task is cancelled — confirm.
+                reporter.describe_and_run().await;
+                Ok(())
+            },
+            &runtime,
+        );
+
+        // Run periodic upkeep so the recorder's internal storage stays bounded.
+        let upkeep_handle = handle.clone();
+        join_set.spawn_on(
+            async move {
+                let mut interval = tokio::time::interval(UPKEEP_INTERVAL);
+                interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+                loop { // no `break`: runs until the task is aborted
+                    interval.tick().await;
+                    upkeep_handle.run_upkeep();
+                }
+            },
+            &runtime,
+        );
+
+        Some(Arc::new(move || handle.render())) // re-rendered on every call
     }
 }
 
@@ -874,16 +858,10 @@ pub mod storage {
     use lazy_static::lazy_static;
     use prometheus::*;
 
-    /// Returns the executor identity label: POD_NAME env var, falling back to HOSTNAME, then "unknown".
-    /// Resolved once on first call and cached for the lifetime of the process.
-    pub fn executor_id() -> &'static str {
-        static EXECUTOR_ID: std::sync::OnceLock<String> = std::sync::OnceLock::new();
-        EXECUTOR_ID.get_or_init(|| {
-            std::env::var("POD_NAME")
-                .or_else(|_| std::env::var("HOSTNAME"))
-                .unwrap_or_else(|_| "unknown".to_string())
-        })
-    }
+    /// Re-exported from [`crate::identity`], which owns the process identity.
+    /// Kept here so existing metric-recording call sites can keep using
+    /// `crate::metrics::storage::executor_id()`.
+    pub use crate::identity::executor_id;
 
     lazy_static! {
         pub static ref STORAGE_FILESYSTEM_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!(
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index a11a411f77..946b20dae6 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -73,6 +73,11 @@ pub struct GolemConfig {
     pub max_websocket_connections: usize,
     pub http_address: String,
     pub http_port: u16,
+    /// How often tokio runtime metrics are sampled from the runtime and pushed
+    /// into the metrics recorder exposed on `/metrics`. Prometheus scrapes the
+    /// rendered values independently; this is the in-process resolution.
+    #[serde(with = "humantime_serde")]
+    pub runtime_metrics_sampling_interval: Duration,
 }
 
 impl SafeDisplay for GolemConfig {
@@ -284,6 +289,7 @@ impl Default for GolemConfig {
             max_websocket_connections: 100,
             http_address: "0.0.0.0".to_string(),
             http_port: 8082,
+            runtime_metrics_sampling_interval: Duration::from_secs(5),
         }
     }
 }

From 78d311d151ca40ffc3ab17de207df5f50009eff7 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:50:05 -0700
Subject: [PATCH 48/60] feat: use official tokio-metrics crate to expose tokio
 runtime metrics vol 2

---
 Cargo.lock | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 Cargo.toml |   3 ++
 2 files changed, 107 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 9296de06e0..3239afe40f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4277,6 +4277,8 @@ dependencies = [
  "log",
  "mac_address",
  "md5",
+ "metrics",
+ "metrics-exporter-prometheus",
  "metrohash",
  "nonempty-collections",
  "nonzero_ext",
@@ -4301,6 +4303,7 @@ dependencies = [
  "tempfile",
  "test-r",
  "tokio",
+ "tokio-metrics",
  "tokio-stream",
  "tokio-tungstenite 0.25.0",
  "tokio-util",
@@ -6007,6 +6010,46 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "metrics"
+version = "0.24.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89550ee9f79e88fef3119de263694973a8adb26c21d75322164fb8c493039fe2"
+dependencies = [
+ "portable-atomic",
+ "rapidhash",
+]
+
+[[package]]
+name = "metrics-exporter-prometheus"
+version = "0.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd7399781913e5393588a8d8c6a2867bf85fb38eaf2502fdce465aad2dc6f034"
+dependencies = [
+ "base64 0.22.1",
+ "indexmap 2.14.0",
+ "metrics",
+ "metrics-util",
+ "quanta",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "metrics-util"
+version = "0.19.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8496cc523d1f94c1385dd8f0f0c2c480b2b8aeccb5b7e4485ad6365523ae376"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+ "hashbrown 0.15.5",
+ "metrics",
+ "quanta",
+ "rand 0.9.2",
+ "rand_xoshiro",
+ "sketches-ddsketch",
+]
+
 [[package]]
 name = "metrohash"
 version = "1.0.7"
@@ -7669,6 +7712,21 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "quanta"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
+dependencies = [
+ "crossbeam-utils",
+ "libc",
+ "once_cell",
+ "raw-cpuid",
+ "wasi",
+ "web-sys",
+ "winapi",
+]
+
 [[package]]
 name = "quick-error"
 version = "1.2.3"
@@ -7856,6 +7914,15 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "rand_xoshiro"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41"
+dependencies = [
+ "rand_core 0.9.5",
+]
+
 [[package]]
 name = "range-set-blaze"
 version = "0.1.16"
@@ -7868,6 +7935,24 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "rapidhash"
+version = "4.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "raw-cpuid"
+version = "11.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
+dependencies = [
+ "bitflags 2.11.1",
+]
+
 [[package]]
 name = "rayon"
 version = "1.11.0"
@@ -9103,6 +9188,12 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e"
 
+[[package]]
+name = "sketches-ddsketch"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b"
+
 [[package]]
 name = "slab"
 version = "0.4.12"
@@ -9946,6 +10037,19 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "tokio-metrics"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9e81d53caf955549b1dec7af4ac2149e94cc25ed97b4a545151140281e2f528"
+dependencies = [
+ "futures-util",
+ "metrics",
+ "pin-project-lite",
+ "tokio",
+ "tokio-stream",
+]
+
 [[package]]
 name = "tokio-native-tls"
 version = "0.3.1"
diff --git a/Cargo.toml b/Cargo.toml
index b6ba881258..eac4fa4458 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -154,6 +154,8 @@ log = "0.4.26"
 mac_address = "1.1.8"
 mappable-rc = "0.1.1"
 md5 = "0.7.0"
+metrics = "0.24.2"
+metrics-exporter-prometheus = { version = "0.16.2", default-features = false }
 metrohash = "1.0.7"
 miette = { version = "7.6.0", features = ["fancy"] }
 mime = "0.3.17"
@@ -248,6 +250,7 @@ textwrap = "0.16.1"
 thiserror = "2.0.12"
 time = { version = "0.3.41", features = ["default", "macros"] }
 tokio = { version = "1.44", features = ["macros", "rt-multi-thread", "sync", "io-std", "net", "tracing", "process", "signal"] }
+tokio-metrics = { version = "0.5.0", features = ["metrics-rs-integration"] }
 tokio-postgres = "0.7.13"
 tokio-rustls = { version = "0.26.2" }
 tokio-stream = { version = "0.1", features = ["sync"] }

From aec411b7275491efe6311313513df5d2662f12c3 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:59:53 -0700
Subject: [PATCH 49/60] chore: cleanup comments

---
 .../concurrent_agents_scheduler.rs            | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
index 2391fce697..3d20d187b6 100644
--- a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
+++ b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
@@ -55,22 +55,18 @@ struct QueuedAgent {
 /// the responsibility to decrement the account's `running_count` and wake the
 /// next queued agent when it is released.
 ///
-/// Crucially, the `running_count` was incremented *together with* acquiring the
-/// raw permit, and the matching decrement lives **only** here in `Drop`. This
-/// binds the count strictly to the lifetime of the granted permit, regardless
-/// of how the slot is ultimately disposed of:
+/// The `running_count` is incremented together with acquiring the raw permit,
+/// and the matching decrement lives only here in `Drop`. This binds the count
+/// strictly to the lifetime of the granted permit, regardless of how the slot
+/// is disposed of:
 ///
-/// * It is moved into a [`ConcurrentAgentPermit`] and dropped when the agent
+/// * it is moved into a [`ConcurrentAgentPermit`] and dropped when the agent
 ///   releases the slot (the normal case), or
 /// * it is sent into a queued waiter's oneshot and that waiter is cancelled
-///   before receiving it — the slot is then dropped *inside* the channel.
+///   before receiving it — the slot is then dropped inside the channel.
 ///
 /// Both paths run this same `Drop`, so a slot granted to a waiter that is
-/// cancelled after the grant succeeded cannot leak the count. (A previous
-/// design decremented only when the oneshot `send` failed, which left
-/// `running_count` permanently inflated when a waiter was cancelled *after* a
-/// successful send — wedging the whole account once the count reached the
-/// limit.)
+/// cancelled after the grant succeeded cannot leak the count.
 struct GrantedSlot {
     raw: Option<OwnedSemaphorePermit>,
     account: Arc<AccountScheduler>,
@@ -359,7 +355,7 @@ fn try_grant_next_sync(account: &Arc<AccountScheduler>, account_id: &AccountId)
 /// uses `try_acquire_owned` which does not block.
 ///
 /// Each granted permit is wrapped in a [`GrantedSlot`] carrying the
-/// `running_count` decrement, so a waiter cancelled *after* a successful send
+/// `running_count` decrement, so a waiter cancelled after a successful send
 /// still releases its slot (via the slot's `Drop` when the oneshot channel is
 /// dropped) rather than leaking the count. The increment here is matched
 /// one-for-one by that slot's `Drop`.

From 71bee784c2bcd44a42061c2e897be5036810f458 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:05:30 -0700
Subject: [PATCH 50/60] feat: try mimalloc

---
 golem-worker-executor/Cargo.toml    | 1 +
 golem-worker-executor/src/server.rs | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/golem-worker-executor/Cargo.toml b/golem-worker-executor/Cargo.toml
index 2aa2f37532..9e83a12c1f 100644
--- a/golem-worker-executor/Cargo.toml
+++ b/golem-worker-executor/Cargo.toml
@@ -76,6 +76,7 @@ md5 = { workspace = true }
 metrics = { workspace = true }
 metrics-exporter-prometheus = { workspace = true }
 metrohash = { workspace = true }
+mimalloc = { workspace = true }
 nonempty-collections = { workspace = true }
 nonzero_ext = { workspace = true }
 pgvector = { workspace = true }
diff --git a/golem-worker-executor/src/server.rs b/golem-worker-executor/src/server.rs
index fbd1c7e60c..18b286adcb 100644
--- a/golem-worker-executor/src/server.rs
+++ b/golem-worker-executor/src/server.rs
@@ -21,6 +21,9 @@ use std::sync::Arc;
 use tokio::task::JoinSet;
 use tracing::info;
 
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
 fn main() -> Result<(), anyhow::Error> {
     match make_config_loader().load_or_dump_config() {
         Some(mut config) => {

From b14682347ce50688ecc32e7109a3ecb862d3dfe1 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:06:54 -0700
Subject: [PATCH 51/60] feat: try mimalloc vol 2

---
 Cargo.lock | 19 +++++++++++++++++++
 Cargo.toml |  1 +
 2 files changed, 20 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 3239afe40f..1821488058 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4280,6 +4280,7 @@ dependencies = [
  "metrics",
  "metrics-exporter-prometheus",
  "metrohash",
+ "mimalloc",
  "nonempty-collections",
  "nonzero_ext",
  "pgvector",
@@ -5738,6 +5739,15 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
+[[package]]
+name = "libmimalloc-sys"
+version = "0.1.49"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a45a52f43e1c16f667ccfe4dd8c85b7f7c204fd5e3bf46c5b0db9a5c3c0b8e9"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "libredox"
 version = "0.1.15"
@@ -6086,6 +6096,15 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "mimalloc"
+version = "0.1.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d4139bb28d14ad1facf21d5eb8825051b326e172d216b39f6d31df53cc97862"
+dependencies = [
+ "libmimalloc-sys",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
diff --git a/Cargo.toml b/Cargo.toml
index eac4fa4458..fbe1e4866e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -160,6 +160,7 @@ metrohash = "1.0.7"
 miette = { version = "7.6.0", features = ["fancy"] }
 mime = "0.3.17"
 mime_guess = "2.0.5"
+mimalloc = "0.1.52"
 minijinja = "2.7.0"
 
 nanoid = "0.4.0"

From 0ef2c16ce7bdf104479720011d199b9364acc87b Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 22:56:33 -0700
Subject: [PATCH 52/60] perf: enable thin LTO and codegen-units=1 for release
 builds

---
 Cargo.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index fbe1e4866e..e9ba81fc0c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -341,6 +341,8 @@ debug = "line-tables-only"
 
 [profile.release]
 panic = "abort"
+lto = "thin"
+codegen-units = 1
 
 [profile.benchmarks]
 inherits = "release"

From 99e3633aa065990da6259798bdfb30d704215f10 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:11:41 -0700
Subject: [PATCH 53/60] perf: pin target-cpu baseline for published images
 (x86-64-v3, neoverse-n1)

---
 .github/workflows/ci.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 095bdbd015..7a8d2fb9f1 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -836,10 +836,14 @@ jobs:
           - platform: linux/amd64
             name: linux/amd64
             target: x86_64-unknown-linux-gnu
+            target_cpu_env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS
+            target_cpu: "-C target-cpu=x86-64-v3"
           - platform: linux/arm64
             name: linux/arm64
             target: aarch64-unknown-linux-gnu
             cross: true
+            target_cpu_env: CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUSTFLAGS
+            target_cpu: "-C target-cpu=neoverse-n1"
     name: docker-targets-build (${{ matrix.platform.platform }})
     steps:
       - uses: actions/checkout@v5
@@ -854,6 +858,12 @@ jobs:
         run: |
           platform=${{ matrix.platform.platform }}
           echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
+          # Target-scoped rustflags pin the instruction-set baseline for the
+          # published images: x86-64-v3 on amd64, Neoverse-N1 on arm64. The
+          # per-target CARGO_TARGET_*_RUSTFLAGS form is used because it scopes
+          # each flag to exactly one target triple, unlike plain RUSTFLAGS;
+          # CARGO_-prefixed vars are also passed into the cross container.
+          echo "${{ matrix.platform.target_cpu_env }}=${{ matrix.platform.target_cpu }}" >> $GITHUB_ENV
 
       - run: cargo install cross
         if: ${{ matrix.platform.cross }}

From 3218b4f8dc4acc2260a1341347c4eed117f0b4db Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Fri, 12 Jun 2026 23:25:38 -0700
Subject: [PATCH 54/60] perf: drop codegen-units=1, keep thin LTO

The codegen-units=1 runtime gain is marginal once thin LTO is enabled,
while it noticeably increases release build time. Revert to the default
to keep intra-crate parallelism.
---
 Cargo.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index e9ba81fc0c..32b71fd001 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -342,7 +342,6 @@ debug = "line-tables-only"
 [profile.release]
 panic = "abort"
 lto = "thin"
-codegen-units = 1
 
 [profile.benchmarks]
 inherits = "release"

From ac3b3ee6145d9a5269e71af02ce5bb1049d9677d Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sat, 13 Jun 2026 20:29:38 -0700
Subject: [PATCH 55/60] chore: lower number of concurrent agents to 200 in case
 of durability overhead test

---
 integration-tests/benchmark_suites/cloud-perf.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index ea8ce74403..9253d128ad 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -126,5 +126,5 @@ benchmarks:
   - name: durability-overhead
     iterations: 1
     clusterSize: [2]
-    size: [10, 50, 100, 250]
+    size: [10, 50, 100, 200]
     length: [5000]

From ee49d867f0d7b6ff4d9b56f51155ff8ceb9ad97c Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 14 Jun 2026 02:44:31 -0700
Subject: [PATCH 56/60] feat: restore 3 iterations

---
 .../benchmark_suites/cloud-perf.yaml          | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
index 9253d128ad..ef8dd7d61f 100644
--- a/integration-tests/benchmark_suites/cloud-perf.yaml
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -29,7 +29,7 @@ benchmarks:
   # size   = number of workers per implementation (×6 implementations total)
   # length = unused for echo
   - name: throughput-echo
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1, 50, 100, 250]
     length: [1000]
@@ -40,7 +40,7 @@ benchmarks:
   # benchmark most relevant to the memory-admission investigation — sized to
   # match throughput-echo so it exercises real density.
   - name: throughput-large-input
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1, 50, 100, 250]
     length: [100, 10000]
@@ -48,7 +48,7 @@ benchmarks:
   # size   = number of workers per implementation
   # length = CPU work length passed to cpu_intensive
   - name: throughput-cpu-intensive
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1, 50, 100, 250]
     length: [100]
@@ -58,14 +58,14 @@ benchmarks:
   # size   = number of unique components created (each in its own env)
   # length = seconds to wait per component for pre-compilation warm-up
   - name: cold-start-unknown-small
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1, 5, 25, 50]
     length: [2]
     disableCompilationCache: true
 
   - name: cold-start-unknown-medium
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1, 5, 25, 50]
     length: [5]
@@ -78,13 +78,13 @@ benchmarks:
   # NOTE: if results here are close to the cache-disabled entries above, the
   # warm-up wait is too short and compilation hasn't finished — bump length.
   - name: cold-start-unknown-small
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1, 5, 25, 50]
     length: [2]
 
   - name: cold-start-unknown-medium
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [1, 5, 25, 50]
     length: [5]
@@ -94,13 +94,13 @@ benchmarks:
   # size   = number of workers created
   # length = number of hot invocations per worker after the first cold one
   - name: latency-small
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [100, 500, 1000, 2000, 5000]
     length: [2]
 
   - name: latency-medium
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [100, 500, 1000, 2000]
     length: [5]
@@ -112,7 +112,7 @@ benchmarks:
   # size   = number of workers launched in parallel
   # length = sleep duration in milliseconds
   - name: sleep
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [10, 100, 500, 1000, 2000]
     length: [10000]
@@ -124,7 +124,7 @@ benchmarks:
   # size   = number of workers per variant
   # length = loop iteration count passed to oplog_heavy
   - name: durability-overhead
-    iterations: 1
+    iterations: 3
     clusterSize: [2]
     size: [10, 50, 100, 200]
     length: [5000]

From e0bdf30affd862d6d83c101b727097d80e37fbdd Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 14 Jun 2026 14:14:48 -0700
Subject: [PATCH 57/60] fix: release worker memory grant on cancelled start

---
 .../services/active_workers/admission/mod.rs  | 117 +++++++--
 .../src/services/active_workers/mod.rs        |  56 ++---
 .../src/services/active_workers/tests.rs      | 222 ++++++++++++++++++
 .../src/services/golem_config.rs              |   4 +-
 golem-worker-executor/src/worker/mod.rs       |  86 ++++---
 5 files changed, 408 insertions(+), 77 deletions(-)

diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs
index e9ca7f7079..02175c11ee 100644
--- a/golem-worker-executor/src/services/active_workers/admission/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs
@@ -37,10 +37,15 @@
 //! both the burst race and later faulting of granted pages.
 //!
 //! The granted total is maintained by two integer updates: a worker's grant is
-//! added on admission and removed on unload (via [`AdmissionController::release`]
-//! from the worker lifecycle). The headroom check re-derives the reservation
-//! from this maintained total and the current probe reading, so it is O(1) and
-//! exact regardless of worker churn.
+//! added on admission, and removed when the [`MemoryGrant`] guard returned by
+//! admission is dropped. Tying the removal to the guard's drop — rather than to
+//! an explicit release call on some worker-lifecycle path — keeps the accounting
+//! symmetric no matter how a worker's start ends: whether it becomes resident and
+//! later stops, or its start is cancelled mid-flight (e.g. the worker is deleted
+//! while still waiting for permits), dropping the guard returns its reservation
+//! exactly once. The headroom check re-derives the reservation from the
+//! maintained total and the current probe reading, so it is O(1) and exact
+//! regardless of worker churn.
 //!
 //! When headroom is short the controller evicts already-resident idle-then-warm
 //! work; if it still cannot make room it rejects rather than over-committing.
@@ -51,12 +56,12 @@
 
 use super::memory_probe::MemoryProbe;
 use async_trait::async_trait;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 
 /// Why an eviction candidate is worth evicting, in priority order. Lower
 /// variants are evicted first.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
-pub enum EvictionPriority {
+pub(crate) enum EvictionPriority {
     /// Resident in memory, not executing, no durable pending work. Cheapest to
     /// evict — losing it costs at most a re-load on next use.
     Idle,
@@ -69,7 +74,7 @@ pub enum EvictionPriority {
 /// restore headroom. Abstracts over the live worker set so the decision logic
 /// is testable without `Worker`/wasmtime.
 #[async_trait]
-pub trait EvictionSource: Send + Sync {
+pub(crate) trait EvictionSource: Send + Sync {
     /// Evict at the given priority tier, attempting to free at least
     /// `needed_bytes`. Returns the number of bytes actually reclaimed (which may
     /// be less if the tier is exhausted, or more if a single victim was larger
@@ -80,7 +85,7 @@ pub trait EvictionSource: Send + Sync {
 
 /// The outcome of an admission attempt.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum AdmissionDecision {
+pub(crate) enum AdmissionDecision {
     /// There is enough real headroom (possibly after eviction) to admit the
     /// request without risking the limit.
     Admit,
@@ -96,7 +101,7 @@ pub enum AdmissionDecision {
 ///   arenas, runtime buffers). Mirrors `worker_memory_ratio`, but applied to the
 ///   measured limit rather than the configured total.
 #[derive(Debug, Clone, Copy)]
-pub struct AdmissionPolicy {
+pub(crate) struct AdmissionPolicy {
     /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission.
     pub usable_ratio: f64,
 }
@@ -106,7 +111,7 @@ pub struct AdmissionPolicy {
 /// probe on each call. The only retained state is `granted`: the total linear
 /// memory granted to live workers, maintained across admit and unload, which the
 /// gate reserves so a worker cannot OOM the node by faulting in granted pages.
-pub struct AdmissionController {
+pub(crate) struct AdmissionController {
     probe: Box<dyn MemoryProbe>,
     policy: AdmissionPolicy,
     granted: Mutex<u64>,
@@ -181,7 +186,7 @@ impl AdmissionController {
     /// becomes resident and shared by all its workers. Unlike admission this does
     /// not evict or reject (the worker is already in); it accounts the bytes so
     /// later admissions see them. Released with [`Self::release`].
-    pub fn reserve_committed(&self, bytes: u64) {
+    pub(crate) fn reserve_committed(&self, bytes: u64) {
         self.reserve(bytes);
     }
 
@@ -189,7 +194,7 @@ impl AdmissionController {
     /// granted. Its pages leave memory, so its grant no longer needs reserving;
     /// not releasing it would permanently shrink admissible headroom as workers
     /// come and go.
-    pub fn release(&self, reserved_bytes: u64) {
+    pub(crate) fn release(&self, reserved_bytes: u64) {
         let mut granted = self.granted.lock().unwrap();
         *granted = granted.saturating_sub(reserved_bytes);
         crate::metrics::workers::record_worker_memory_granted(*granted);
@@ -211,7 +216,7 @@ impl AdmissionController {
     /// headroom is re-measured against ground truth; the request is admitted only
     /// if the real headroom now covers it, otherwise it is rejected. On admit the
     /// request is added to the in-flight reservation.
-    pub async fn try_admit(
+    async fn try_admit(
         &self,
         request_bytes: u64,
         source: &dyn EvictionSource,
@@ -244,11 +249,91 @@ impl AdmissionController {
         }
     }
 
-    /// The current admissible headroom. Exposed for metrics and for callers that
-    /// want to make their own pre-check.
-    pub fn headroom_bytes(&self) -> u64 {
+    /// The current admissible headroom. Used by tests to assert the gate's
+    /// accounting; production reads headroom indirectly through admission.
+    #[cfg(test)]
+    pub(crate) fn headroom_bytes(&self) -> u64 {
         self.admissible_headroom()
     }
+
+    /// Like [`Self::try_admit`], but on admit returns a [`MemoryGrant`] guard
+    /// that owns the reservation and releases it on drop. The grant a starting
+    /// worker holds passes through several `.await` points before the worker
+    /// becomes resident (per-account concurrency, component charge, filesystem
+    /// storage); if that work is cancelled — as when the worker is deleted while
+    /// still waiting — the guard's drop returns the reservation, so a cancelled
+    /// start cannot leak headroom.
+    pub(crate) async fn try_admit_grant(
+        self: &Arc<Self>,
+        request_bytes: u64,
+        source: &dyn EvictionSource,
+    ) -> Option<MemoryGrant> {
+        match self.try_admit(request_bytes, source).await {
+            AdmissionDecision::Admit => Some(MemoryGrant {
+                controller: Some(self.clone()),
+                bytes: request_bytes,
+            }),
+            AdmissionDecision::Reject => None,
+        }
+    }
+}
+
+/// Owns a memory reservation made with the [`AdmissionController`] and returns it
+/// to the gate when dropped, so a reservation is released exactly once regardless
+/// of whether the worker became resident or its start was cancelled.
+///
+/// When measured admission is disabled (no controller) the grant is inert: it
+/// reserves nothing and releasing it is a no-op, so callers can hold a grant
+/// uniformly without branching on whether admission is active.
+pub(crate) struct MemoryGrant {
+    controller: Option<Arc<AdmissionController>>,
+    bytes: u64,
+}
+
+impl MemoryGrant {
+    /// An inert grant for when measured admission is disabled: holds no
+    /// reservation and releases nothing on drop.
+    pub(crate) fn inert() -> Self {
+        Self {
+            controller: None,
+            bytes: 0,
+        }
+    }
+
+    /// Fold another grant's bytes into this one, so a worker that grows its
+    /// memory carries a single grant covering its whole reservation. The other
+    /// grant is consumed and its reservation transferred here; the combined total
+    /// is released exactly once when this grant drops.
+    pub(crate) fn merge(&mut self, mut other: MemoryGrant) {
+        if other.controller.is_some() {
+            // Adopt the controller so a merged grant acquired while admission was
+            // enabled still releases, even if `self` started inert.
+            if self.controller.is_none() {
+                self.controller = other.controller.take();
+            }
+            self.bytes += other.bytes;
+        }
+        // Neutralize the absorbed grant so its drop does not release the bytes
+        // now owned by `self`.
+        other.bytes = 0;
+        other.controller = None;
+    }
+}
+
+impl std::fmt::Debug for MemoryGrant {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("MemoryGrant")
+            .field("bytes", &self.bytes)
+            .finish()
+    }
+}
+
+impl Drop for MemoryGrant {
+    fn drop(&mut self) {
+        if let Some(controller) = &self.controller {
+            controller.release(self.bytes);
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 4ac70f8744..9cccc56112 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -29,7 +29,8 @@ pub use fs_semaphore::{
     filesystem_storage_permits_to_bytes, filesystem_storage_pool_bytes_to_permits,
 };
 
-use admission::{AdmissionController, AdmissionDecision, EvictionPriority, EvictionSource};
+pub(crate) use admission::MemoryGrant;
+use admission::{AdmissionController, EvictionPriority, EvictionSource};
 use async_trait::async_trait;
 pub use component_charge::HeldComponentCharge;
 use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry};
@@ -235,7 +236,10 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     }
 
     /// Blocking memory admission for a starting worker. Loops until the gate
-    /// admits the request, backing off between attempts.
+    /// admits the request, backing off between attempts, and returns a
+    /// [`MemoryGrant`] guard owning the reservation: the worker holds it for as
+    /// long as it is resident and releases it by dropping the guard, so a start
+    /// cancelled before the worker becomes resident cannot leak the reservation.
     ///
     /// A rejection is transient, not terminal. The gate reads resident memory
     /// from the probe, which lags real usage (cgroup `memory.current` only counts
@@ -244,18 +248,19 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     /// Each iteration backs off and re-reads the gate, so the caller eventually
     /// proceeds once headroom recovers rather than failing under momentary
     /// pressure. With measured admission disabled the worker is admitted
-    /// immediately.
-    pub async fn acquire(&self, memory: u64) {
+    /// immediately with an inert grant.
+    pub(crate) async fn acquire(&self, memory: u64) -> MemoryGrant {
         let Some(admission) = &self.admission else {
-            return;
+            return MemoryGrant::inert();
         };
         loop {
             // Evicts idle-then-warm when real headroom is short; rejects (and we
             // back off) when it cannot make room rather than risking the limit.
-            if admission.try_admit(memory, &self.eviction_source()).await
-                == AdmissionDecision::Admit
+            if let Some(grant) = admission
+                .try_admit_grant(memory, &self.eviction_source())
+                .await
             {
-                return;
+                return grant;
             }
             debug!("Measured headroom insufficient for {memory}, backing off and retrying");
             tokio::time::sleep(self.acquire_retry_delay).await;
@@ -271,33 +276,28 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     }
 
     /// Non-blocking memory admission for a growing worker. A single gate attempt:
-    /// returns `true` when the grow is admitted, `false` when real headroom is
-    /// insufficient even after eviction (the caller turns this into a retriable
-    /// out-of-memory trap). With measured admission disabled the grow is always
-    /// admitted.
-    pub async fn try_acquire(&self, memory: u64) -> bool {
+    /// returns the additional [`MemoryGrant`] when the grow is admitted, or `None`
+    /// when real headroom is insufficient even after eviction (the caller turns
+    /// `None` into a retriable out-of-memory trap). The returned grant should be
+    /// merged into the worker's existing grant so its whole reservation is
+    /// released together on unload. With measured admission disabled the grow is
+    /// always admitted with an inert grant.
+    pub(crate) async fn try_acquire(&self, memory: u64) -> Option<MemoryGrant> {
         let Some(admission) = &self.admission else {
-            return true;
+            return Some(MemoryGrant::inert());
         };
-        match admission.try_admit(memory, &self.eviction_source()).await {
-            AdmissionDecision::Admit => true,
-            AdmissionDecision::Reject => {
+        match admission
+            .try_admit_grant(memory, &self.eviction_source())
+            .await
+        {
+            Some(grant) => Some(grant),
+            None => {
                 debug!("Measured headroom insufficient for {memory}, not admitting");
-                false
+                None
             }
         }
     }
 
-    /// Release the memory a worker reserved with the admission gate when it
-    /// unloads. `bytes` must be the cumulative amount the worker reserved through
-    /// [`Self::acquire`] and [`Self::try_acquire`], so the gate's granted total
-    /// stays symmetric. No-op when measured admission is disabled.
-    pub fn release_memory(&self, bytes: u64) {
-        if let Some(admission) = &self.admission {
-            admission.release(bytes);
-        }
-    }
-
     /// Blocking acquire of storage semaphore permits. Loops until the requested
     /// number of bytes is available, evicting idle workers as needed.
     pub async fn acquire_filesystem_storage(&self, storage_bytes: u64) -> FilesystemStoragePermit {
diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs
index 1f6c8313cf..66017602a3 100644
--- a/golem-worker-executor/src/services/active_workers/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/tests.rs
@@ -1060,3 +1060,225 @@ mod scheduler_liveness {
         Ok(())
     }
 }
+
+// ── Grant-guard liveness under random churn ──────────────────────────────────
+//
+// A worker's memory grant is reserved with the admission gate and then owned by
+// a guard that lives in one of three places over the worker's lifetime: in the
+// in-flight start task (waiting for permits), in the resident worker (started),
+// or dropped (the worker exited or its start was cancelled). The liveness
+// invariant — mirroring `scheduler_liveness` for the concurrent-agents scheduler
+// — is that however the guard travels between those places, the gate's
+// accounting stays symmetric: once every guard is gone, admissible headroom
+// returns to the full ceiling. A reservation released zero times (leak, the
+// cancelled-while-waiting deletion bug) or more than once (double-release) breaks
+// it. With a zero-usage probe, headroom is `ceiling - granted`, so the final
+// headroom reads the granted total directly.
+mod grant_guard_liveness {
+    use super::super::admission::{
+        AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource, MemoryGrant,
+    };
+    use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot};
+    use proptest::prelude::*;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_r::test;
+    use tokio::task::JoinHandle;
+
+    /// Probe with a fixed limit reporting zero resident usage, so admissible
+    /// headroom equals `ceiling - granted` and reads the granted accounting
+    /// directly — the quantity a leaked or double-released grant corrupts.
+    #[derive(Debug)]
+    struct ZeroUsageProbe {
+        limit: u64,
+    }
+
+    impl MemoryProbe for ZeroUsageProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: self.limit,
+                current_bytes: 0,
+            }
+        }
+    }
+
+    /// Nothing to evict: a rejected request stays rejected (every generated
+    /// request is smaller than the ceiling, so admission only fails transiently,
+    /// never due to a leak the gate could not see).
+    struct NoEvictionSource;
+
+    #[async_trait::async_trait]
+    impl EvictionSource for NoEvictionSource {
+        async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 {
+            0
+        }
+    }
+
+    /// One step in a randomized grant-lifecycle workload.
+    #[derive(Debug, Clone)]
+    enum Op {
+        /// Begin a worker start: spawn a task that requests a grant of this many
+        /// bytes, passes it to the driver over a oneshot channel, and then parks,
+        /// as a worker waits for its remaining permits before becoming resident.
+        Start(u64),
+        /// A still-in-flight start becomes resident: its task yields the grant
+        /// guard, which we keep (the worker is now running).
+        Resident(prop::sample::Index),
+        /// Cancel a still-in-flight start, as deleting a waiting worker does:
+        /// abort the task and drop its channel, dropping any grant it obtained.
+        CancelStart(prop::sample::Index),
+        /// A resident worker exits: drop its grant guard.
+        Exit(prop::sample::Index),
+    }
+
+    fn arb_ops() -> impl Strategy<Value = Vec<Op>> {
+        prop::collection::vec(
+            prop_oneof![
+                4 => (1u64..50).prop_map(Op::Start),
+                2 => any::<prop::sample::Index>().prop_map(Op::Resident),
+                3 => any::<prop::sample::Index>().prop_map(Op::CancelStart),
+                2 => any::<prop::sample::Index>().prop_map(Op::Exit),
+            ],
+            1..80,
+        )
+    }
+
+    /// An in-flight start: the task acquires the grant, then sends it back over
+    /// `ready` and parks, so the driver can either take the grant (the worker
+    /// became resident) or abort the task (the start was cancelled, dropping the
+    /// grant inside the task).
+    struct InFlight {
+        handle: JoinHandle<()>,
+        ready: tokio::sync::oneshot::Receiver<MemoryGrant>,
+    }
+
+    /// Drive one randomized workload and assert headroom recovers to the ceiling
+    /// once every grant guard is gone.
+    async fn run_workload(limit: u64, ops: Vec<Op>) -> Result<(), TestCaseError> {
+        let controller = Arc::new(AdmissionController::new(
+            Box::new(ZeroUsageProbe { limit }),
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ));
+
+        let mut in_flight: Vec<InFlight> = Vec::new();
+        let mut resident: Vec<MemoryGrant> = Vec::new();
+
+        for op in ops {
+            match op {
+                Op::Start(bytes) => {
+                    let controller = controller.clone();
+                    let (tx, rx) = tokio::sync::oneshot::channel();
+                    let handle = tokio::spawn(async move {
+                        if let Some(grant) =
+                            controller.try_admit_grant(bytes, &NoEvictionSource).await
+                        {
+                            // Hand the grant to the driver, then park holding the
+                            // task alive so an abort drops the guard if the driver
+                            // never took it.
+                            let _ = tx.send(grant);
+                        }
+                        std::future::pending::<()>().await;
+                    });
+                    in_flight.push(InFlight { handle, ready: rx });
+                }
+                Op::Resident(idx) => {
+                    if !in_flight.is_empty() {
+                        let i = idx.index(in_flight.len());
+                        let started = in_flight.remove(i);
+                        // Take the grant out of the task (worker is now resident),
+                        // then abort the now-grantless parked task.
+                        match started.ready.await {
+                            Ok(grant) => {
+                                resident.push(grant);
+                                started.handle.abort();
+                                let _ = started.handle.await;
+                            }
+                            Err(_) => {
+                                // Admission was rejected; nothing was granted.
+                                started.handle.abort();
+                                let _ = started.handle.await;
+                            }
+                        }
+                    }
+                }
+                Op::CancelStart(idx) => {
+                    if !in_flight.is_empty() {
+                        let i = idx.index(in_flight.len());
+                        let started = in_flight.remove(i);
+                        // Delete a waiting worker: abort mid-flight. If the task
+                        // had already acquired its grant, the guard is dropped
+                        // inside the aborted task.
+                        started.handle.abort();
+                        let _ = started.handle.await;
+                    }
+                }
+                Op::Exit(idx) => {
+                    if !resident.is_empty() {
+                        let i = idx.index(resident.len());
+                        drop(resident.remove(i));
+                    }
+                }
+            }
+            // Let acquires/aborts settle so the granted accounting is observable.
+            for _ in 0..4 {
+                tokio::task::yield_now().await;
+            }
+        }
+
+        // Tear everything down: abort remaining starts, drop remaining resident
+        // grants. The environment is now empty.
+        for started in in_flight.drain(..) {
+            started.handle.abort();
+            let _ = started.handle.await;
+        }
+        resident.clear();
+        // Let the final drops' releases settle.
+        tokio::time::sleep(Duration::from_millis(20)).await;
+
+        let headroom = controller.headroom_bytes();
+        prop_assert_eq!(
+            headroom,
+            limit,
+            "headroom did not recover to ceiling {} after all grants were released (got {}); \
+             a grant leaked or was double-released across the lifecycle",
+            limit,
+            headroom
+        );
+
+        // And the gate must be live again: a fresh full-ceiling admission fits.
+        let readmit = controller.try_admit_grant(limit, &NoEvictionSource).await;
+        prop_assert!(
+            readmit.is_some(),
+            "gate refused a full-ceiling admission after draining; headroom is wedged"
+        );
+        Ok(())
+    }
+
+    proptest! {
+        #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })]
+
+        /// Liveness: under any interleaving of start / become-resident /
+        /// cancel-start / exit, once every grant guard is gone the gate's
+        /// admissible headroom returns to the full ceiling and admits again. A
+        /// grant that leaks on cancellation (or is released twice) breaks this.
+        #[test]
+        fn grants_never_leak_under_random_churn(
+            limit in 200u64..4000,
+            ops in arb_ops(),
+        ) {
+            let rt = tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(4)
+                .enable_time()
+                .build()
+                .unwrap();
+
+            rt.block_on(async move {
+                tokio::time::timeout(Duration::from_secs(10), run_workload(limit, ops))
+                    .await
+                    .unwrap_or_else(|_| Err(TestCaseError::fail(
+                        "grant workload did not complete within the timeout",
+                    )))
+            })?;
+        }
+    }
+}
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 946b20dae6..7e6ca1a298 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -988,7 +988,9 @@ impl MemoryConfig {
     /// The admission policy for the measured-headroom gate. Reuses
     /// `worker_memory_ratio` as the usable fraction of the measured limit (the
     /// host keeps the remainder).
-    pub fn admission_policy(&self) -> crate::services::active_workers::admission::AdmissionPolicy {
+    pub(crate) fn admission_policy(
+        &self,
+    ) -> crate::services::active_workers::admission::AdmissionPolicy {
         crate::services::active_workers::admission::AdmissionPolicy {
             usable_ratio: self.worker_memory_ratio,
         }
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index 69dd1c769f..23ba710f81 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -27,7 +27,7 @@ use crate::durable_host::recover_stderr_logs;
 use crate::metrics::storage::record_filesystem_pool_released;
 use crate::model::{AgentConfig, ExecutionStatus, LookupResult, ReadFileResult, TrapType};
 use crate::services::active_workers::{
-    FilesystemStoragePermit, HeldComponentCharge, RegisteredConcurrentAccount,
+    FilesystemStoragePermit, HeldComponentCharge, MemoryGrant, RegisteredConcurrentAccount,
     WorkerComponentCharge,
 };
 use crate::services::events::{Event, EventsSubscription};
@@ -137,11 +137,6 @@ pub struct Worker<Ctx: WorkerCtx> {
     /// at least that many bytes from the blocking eviction path, ensuring
     /// enough idle workers are evicted to satisfy the pending write.
     desired_extra_filesystem_storage: AtomicU64,
-    /// Cumulative memory bytes this worker has reserved with the admission gate:
-    /// its initial requirement plus every grow delta. Released back to the gate
-    /// in full when the worker unloads, so the gate's granted total stays exactly
-    /// symmetric with what was reserved.
-    granted_memory: AtomicU64,
 }
 
 impl<Ctx: WorkerCtx> HasOplog for Worker<Ctx> {
@@ -354,7 +349,6 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             last_resume_request: Mutex::new(Timestamp::now_utc()),
             snapshot_recovery_disabled: AtomicBool::new(false),
             desired_extra_filesystem_storage: AtomicU64::new(0),
-            granted_memory: AtomicU64::new(0),
         };
 
         // Wire the worker event service into the forwarding oplog so plugin errors
@@ -1004,24 +998,26 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             | WorkerInstance::Deleting => return Ok(()),
         }
 
-        if self.active_workers().try_acquire(delta).await {
-            self.granted_memory.fetch_add(delta, Ordering::Relaxed);
-            Ok(())
-        } else {
+        let Some(extra_grant) = self.active_workers().try_acquire(delta).await else {
             crate::metrics::workers::record_worker_memory_grow_rejected();
-            Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory))
-        }
-    }
+            return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory));
+        };
 
-    /// Release this worker's entire accumulated memory grant back to the
-    /// admission gate, resetting the running total to zero. Called when the
-    /// worker stops being resident; a later reload re-accumulates the grant from
-    /// scratch through the acquire path.
-    fn release_granted_memory(&self) {
-        let granted = self.granted_memory.swap(0, Ordering::Relaxed);
-        if granted > 0 {
-            self.active_workers().release_memory(granted);
+        // Re-check state under the lock: the worker may have changed state while
+        // the gate ran. If it is still running, merge the extra grant into the
+        // running worker so its whole reservation releases together on unload.
+        // Otherwise drop `extra_grant` here, returning the reservation to the
+        // gate, and treat the grow as a no-op (matching the non-running arms).
+        match &mut *self.instance.lock().await {
+            WorkerInstance::Running(running) => {
+                running.merge_extra_memory_grant(extra_grant);
+            }
+            WorkerInstance::Stopping(_)
+            | WorkerInstance::WaitingForPermit(_)
+            | WorkerInstance::Unloaded { .. }
+            | WorkerInstance::Deleting => {}
         }
+        Ok(())
     }
 
     /// Return `freed_bytes` to the storage semaphore pool.
@@ -1672,15 +1668,18 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                 // when stopping via the invocation loop we can stop immediately, no need to go via the stopping status
                 if called_from_invocation_loop {
                     crate::metrics::workers::dec_worker_memory_resident();
-                    self.release_granted_memory();
+                    // Dropping `running` at the end of this arm releases its
+                    // memory grant (and component/storage permits) back to the
+                    // gate.
                     **instance_guard = final_state.into_instance();
                     StopResult::Stopped
                 } else {
                     // drop the running worker, this signals to the invocation loop to start exiting.
+                    // `stop()` consumes the RunningWorker and drops everything but
+                    // its join handle, releasing its memory grant back to the gate.
                     let run_loop_handle = running.stop();
                     let notify = OneShotEvent::new();
                     crate::metrics::workers::dec_worker_memory_resident();
-                    self.release_granted_memory();
                     **instance_guard = WorkerInstance::Stopping(StoppingWorker {
                         notify: notify.clone(),
                         final_state,
@@ -2229,6 +2228,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
 
     async fn start_waiting_worker(
         this: Arc<Worker<Ctx>>,
+        memory_grant: MemoryGrant,
         component_charge: WorkerComponentCharge,
         filesystem_storage_permit: Option<FilesystemStoragePermit>,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
@@ -2244,6 +2244,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                     this.owned_agent_id.clone(),
                     this.queue.clone(),
                     this.clone(),
+                    memory_grant,
                     component_charge,
                     concurrent_agent_permit,
                     oom_retry_count,
@@ -2258,9 +2259,8 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             }
             _ => {
                 debug!("worker was not waiting for permit anymore, not starting");
-                // The grant was reserved before this call; the worker is not
-                // becoming resident, so release it rather than leak it.
-                this.release_granted_memory();
+                // The worker is not becoming resident: dropping `memory_grant`
+                // here returns its reservation to the gate.
             }
         }
     }
@@ -2401,10 +2401,15 @@ impl WaitingWorker {
                 // Do not gate executor memory while waiting for a per-account
                 // concurrency slot. Otherwise one account could exhaust the
                 // memory headroom with workers that are not allowed to run yet.
-                parent.active_workers().acquire(memory_requirement).await;
-                parent
-                    .granted_memory
-                    .fetch_add(memory_requirement, Ordering::Relaxed);
+                //
+                // `memory_grant` owns the reservation from here on: it is held as
+                // a local until the worker becomes resident (when it moves into
+                // the RunningWorker) or this task ends/aborts (when dropping it
+                // returns the reservation to the gate). This is what makes a
+                // start cancelled mid-flight — e.g. the worker being deleted while
+                // still waiting for its remaining permits — release rather than
+                // leak its grant.
+                let memory_grant = parent.active_workers().acquire(memory_requirement).await;
                 // Reserve the component's compiled module size once per resident
                 // component (shared by all its workers). Held for as long as this
                 // worker is resident; the module faults into RAM when the first
@@ -2424,7 +2429,7 @@ impl WaitingWorker {
                         warn!(
                             "Failed to determine component charge requirement, not starting: {err}"
                         );
-                        parent.release_granted_memory();
+                        // Dropping `memory_grant` here returns its reservation.
                         return;
                     }
                 };
@@ -2478,6 +2483,7 @@ impl WaitingWorker {
                 debug!("Attempting to start worker after acquiring enough permits");
                 Worker::start_waiting_worker(
                     parent,
+                    memory_grant,
                     component_charge,
                     filesystem_storage_permit,
                     concurrent_agent_permit,
@@ -2510,6 +2516,13 @@ struct RunningWorker {
     handle: Option<JoinHandle<()>>,
     sender: UnboundedSender<WorkerCommand>,
     queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
+    /// The worker's memory reservation with the admission gate, covering its
+    /// initial requirement plus any grow deltas merged in. Held only to be
+    /// dropped: dropping it (on stop, eviction, or this worker being dropped for
+    /// any reason) returns the reservation to the gate, keeping the granted total
+    /// symmetric with what was reserved.
+    #[allow(dead_code)]
+    memory_grant: MemoryGrant,
     /// Keeps this worker's component module charge alive while it is resident.
     /// Held only to be dropped: dropping it releases the component's residency
     /// (and the module reservation if this was the last worker of the component).
@@ -2545,6 +2558,7 @@ impl RunningWorker {
         owned_agent_id: OwnedAgentId,
         queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
         parent: Arc<Worker<Ctx>>,
+        memory_grant: MemoryGrant,
         component_charge: WorkerComponentCharge,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
         oom_retry_count: u32,
@@ -2595,6 +2609,7 @@ impl RunningWorker {
             handle: Some(handle),
             sender,
             queue,
+            memory_grant,
             component_charge: Box::new(component_charge),
             filesystem_storage_permit: None,
             waiting_for_command,
@@ -2603,6 +2618,13 @@ impl RunningWorker {
         }
     }
 
+    /// Merge an additional memory grant (from a successful grow) into this
+    /// worker's grant, so its whole reservation is released together when the
+    /// worker unloads.
+    pub fn merge_extra_memory_grant(&mut self, extra: MemoryGrant) {
+        self.memory_grant.merge(extra);
+    }
+
     /// Merge additional storage permits into this worker's storage permit. If
     /// the worker does not yet hold a storage permit, the given permit becomes
     /// the initial one. Additional calls merge into that initial permit.

From 4f4a5cae343a460246447dc28e513408b598e6e6 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 14 Jun 2026 14:52:32 -0700
Subject: [PATCH 58/60] feat: move tests around

---
 .../services/active_workers/admission/mod.rs  | 18 +++---
 .../src/services/active_workers/mod.rs        | 10 +---
 .../src/services/active_workers/tests.rs      | 60 +++++++++----------
 3 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs
index 02175c11ee..ec57acd699 100644
--- a/golem-worker-executor/src/services/active_workers/admission/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs
@@ -256,14 +256,16 @@ impl AdmissionController {
         self.admissible_headroom()
     }
 
-    /// Like [`Self::try_admit`], but on admit returns a [`MemoryGrant`] guard
-    /// that owns the reservation and releases it on drop. The grant a starting
-    /// worker holds passes through several `.await` points before the worker
-    /// becomes resident (per-account concurrency, component charge, filesystem
-    /// storage); if that work is cancelled — as when the worker is deleted while
-    /// still waiting — the guard's drop returns the reservation, so a cancelled
-    /// start cannot leak headroom.
-    pub(crate) async fn try_admit_grant(
+    /// Admit `request_bytes`, evicting resident idle-then-warm work if needed,
+    /// and on success return a [`MemoryGrant`] guard that owns the reservation
+    /// and releases it on drop; `None` if the request cannot be admitted.
+    ///
+    /// The grant a starting worker holds passes through several `.await` points
+    /// before the worker becomes resident (per-account concurrency, component
+    /// charge, filesystem storage); if that work is cancelled — as when the
+    /// worker is deleted while still waiting — the guard's drop returns the
+    /// reservation, so a cancelled start cannot leak headroom.
+    pub(crate) async fn admit(
         self: &Arc<Self>,
         request_bytes: u64,
         source: &dyn EvictionSource,
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 9cccc56112..24784065b4 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -256,10 +256,7 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         loop {
             // Evicts idle-then-warm when real headroom is short; rejects (and we
             // back off) when it cannot make room rather than risking the limit.
-            if let Some(grant) = admission
-                .try_admit_grant(memory, &self.eviction_source())
-                .await
-            {
+            if let Some(grant) = admission.admit(memory, &self.eviction_source()).await {
                 return grant;
             }
             debug!("Measured headroom insufficient for {memory}, backing off and retrying");
@@ -286,10 +283,7 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         let Some(admission) = &self.admission else {
             return Some(MemoryGrant::inert());
         };
-        match admission
-            .try_admit_grant(memory, &self.eviction_source())
-            .await
-        {
+        match admission.admit(memory, &self.eviction_source()).await {
             Some(grant) => Some(grant),
             None => {
                 debug!("Measured headroom insufficient for {memory}, not admitting");
diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs
index 66017602a3..217c0e21b6 100644
--- a/golem-worker-executor/src/services/active_workers/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/tests.rs
@@ -1143,13 +1143,15 @@ mod grant_guard_liveness {
         )
     }
 
-    /// An in-flight start: the task acquires the grant, then sends it back over
-    /// `ready` and parks, so the driver can either take the grant (the worker
-    /// became resident) or abort the task (the start was cancelled, dropping the
-    /// grant inside the task).
+    /// An in-flight start: the task runs admission, reports the outcome back over
+    /// `ready` (the grant on admit, `None` if the gate rejected it), then parks
+    /// holding the grant. The driver can take the grant (the worker became
+    /// resident) or abort the task (the start was cancelled, dropping any grant
+    /// inside the task). The outcome is always reported, so the driver never
+    /// blocks waiting on a start that was rejected.
     struct InFlight {
         handle: JoinHandle<()>,
-        ready: tokio::sync::oneshot::Receiver<MemoryGrant>,
+        ready: tokio::sync::oneshot::Receiver<Option<MemoryGrant>>,
     }
 
     /// Drive one randomized workload and assert headroom recovers to the ceiling
@@ -1169,14 +1171,15 @@ mod grant_guard_liveness {
                     let controller = controller.clone();
                     let (tx, rx) = tokio::sync::oneshot::channel();
                     let handle = tokio::spawn(async move {
-                        if let Some(grant) =
-                            controller.try_admit_grant(bytes, &NoEvictionSource).await
-                        {
-                            // Hand the grant to the driver, then park holding the
-                            // task alive so an abort drops the guard if the driver
-                            // never took it.
-                            let _ = tx.send(grant);
-                        }
+                        // Always report the admission outcome so the driver never
+                        // blocks on a start that was rejected. On admit the grant
+                        // travels to the driver (held in the channel until taken
+                        // as resident or dropped on cancel); on reject we report
+                        // `None`.
+                        let outcome = controller.admit(bytes, &NoEvictionSource).await;
+                        let _ = tx.send(outcome);
+                        // Park so the task stays alive until the driver decides
+                        // its fate (become resident, or be aborted on cancel).
                         std::future::pending::<()>().await;
                     });
                     in_flight.push(InFlight { handle, ready: rx });
@@ -1185,31 +1188,28 @@ mod grant_guard_liveness {
                     if !in_flight.is_empty() {
                         let i = idx.index(in_flight.len());
                         let started = in_flight.remove(i);
-                        // Take the grant out of the task (worker is now resident),
-                        // then abort the now-grantless parked task.
-                        match started.ready.await {
-                            Ok(grant) => {
-                                resident.push(grant);
-                                started.handle.abort();
-                                let _ = started.handle.await;
-                            }
-                            Err(_) => {
-                                // Admission was rejected; nothing was granted.
-                                started.handle.abort();
-                                let _ = started.handle.await;
-                            }
+                        // Becoming resident requires the start to have been
+                        // admitted. Take the grant if there is one (worker is now
+                        // running); a rejected start cannot become resident and is
+                        // simply discarded. Either way abort the parked task.
+                        if let Ok(Some(grant)) = started.ready.await {
+                            resident.push(grant);
                         }
+                        started.handle.abort();
+                        let _ = started.handle.await;
                     }
                 }
                 Op::CancelStart(idx) => {
                     if !in_flight.is_empty() {
                         let i = idx.index(in_flight.len());
                         let started = in_flight.remove(i);
-                        // Delete a waiting worker: abort mid-flight. If the task
-                        // had already acquired its grant, the guard is dropped
-                        // inside the aborted task.
+                        // Delete a waiting worker: abort the task and drop the
+                        // `InFlight`. Any grant the start acquired is held in
+                        // `started.ready`; dropping it returns the reservation,
+                        // exactly as aborting a waiting worker mid-flight does.
                         started.handle.abort();
                         let _ = started.handle.await;
+                        drop(started.ready);
                     }
                 }
                 Op::Exit(idx) => {
@@ -1246,7 +1246,7 @@ mod grant_guard_liveness {
         );
 
         // And the gate must be live again: a fresh full-ceiling admission fits.
-        let readmit = controller.try_admit_grant(limit, &NoEvictionSource).await;
+        let readmit = controller.admit(limit, &NoEvictionSource).await;
         prop_assert!(
             readmit.is_some(),
             "gate refused a full-ceiling admission after draining; headroom is wedged"

From 4cfd2e714d5f686d26fe64dc7702b47984766fad Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 14 Jun 2026 16:36:35 -0700
Subject: [PATCH 59/60] debug: add more logging to figure out why we leak
 memory and deadlock

---
 golem-worker-executor/src/worker/mod.rs | 51 ++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index 23ba710f81..9601fd7a47 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -1702,7 +1702,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                 run_loop_handle,
                 notify,
             } => {
-                run_loop_handle.await.expect("Failed to join run loop");
+                join_run_loop_with_watchdog(&self.owned_agent_id, run_loop_handle).await;
 
                 let mut instance_guard = self.instance.lock().await;
                 let is_deleting = match &*instance_guard {
@@ -2308,6 +2308,37 @@ impl Drop for WorkerStatusMetric {
     }
 }
 
+/// Joins the invocation-loop task, logging a periodic warning if the join does
+/// not complete promptly.
+///
+/// The stop path drops the worker's command channel sender and then waits here
+/// for the loop task to observe the close and return. If the loop is parked at
+/// an await that never observes the closed channel, this join never completes
+/// and the delete/stop that triggered it is wedged. The watchdog surfaces that
+/// condition with the agent id so a stuck teardown is diagnosable from logs.
+async fn join_run_loop_with_watchdog(owned_agent_id: &OwnedAgentId, handle: JoinHandle<()>) {
+    const WARN_AFTER: Duration = Duration::from_secs(10);
+
+    let mut handle = handle;
+    let mut waited = Duration::ZERO;
+    loop {
+        match tokio::time::timeout(WARN_AFTER, &mut handle).await {
+            Ok(join_result) => {
+                join_result.expect("Failed to join run loop");
+                return;
+            }
+            Err(_) => {
+                waited += WARN_AFTER;
+                warn!(
+                    agent_id = %owned_agent_id,
+                    waited_secs = waited.as_secs(),
+                    "Still waiting for invocation loop to exit during stop; the loop task may be parked at an uninterruptible await"
+                );
+            }
+        }
+    }
+}
+
 pub fn merge_agent_env_with_default_env(
     agent_env: Option<Vec<(String, String)>>,
     default_agent_env: BTreeMap<String, String>,
@@ -2514,6 +2545,7 @@ impl Drop for WaitingWorker {
 #[derive(Debug)]
 struct RunningWorker {
     handle: Option<JoinHandle<()>>,
+    owned_agent_id: OwnedAgentId,
     sender: UnboundedSender<WorkerCommand>,
     queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
     /// The worker's memory reservation with the admission gate, covering its
@@ -2550,6 +2582,19 @@ impl Drop for RunningWorker {
                 record_filesystem_pool_released(bytes);
             }
         }
+        // A `RunningWorker` is normally torn down via `stop()`, which takes the
+        // handle so the invocation loop can be joined after its command channel
+        // closes. If the handle is still present here and the task has not
+        // finished, the loop task is being orphaned: it will keep running and
+        // hold its wasmtime `Store` even though the worker is gone.
+        if let Some(handle) = &self.handle
+            && !handle.is_finished()
+        {
+            warn!(
+                agent_id = %self.owned_agent_id,
+                "RunningWorker dropped while its invocation loop task is still running; the loop task is being orphaned and will leak its Store"
+            );
+        }
     }
 }
 
@@ -2568,6 +2613,7 @@ impl RunningWorker {
 
         let active_clone = queue.clone();
         let owned_agent_id_clone = owned_agent_id.clone();
+        let owned_agent_id_log = owned_agent_id.clone();
         let waiting_for_command = Arc::new(AtomicBool::new(false));
         let waiting_for_command_clone = waiting_for_command.clone();
         let interrupt_signal = Arc::new(async_lock::Mutex::new(None));
@@ -2588,6 +2634,7 @@ impl RunningWorker {
         );
         let handle = tokio::task::spawn(
             async move {
+                debug!(agent_id = %owned_agent_id_log, "Invocation loop task started");
                 RunningWorker::invocation_loop(
                     receiver,
                     active_clone,
@@ -2601,12 +2648,14 @@ impl RunningWorker {
                 )
                 .instrument(span)
                 .await;
+                debug!(agent_id = %owned_agent_id_log, "Invocation loop task exited");
             }
             .in_current_span(),
         );
 
         RunningWorker {
             handle: Some(handle),
+            owned_agent_id,
             sender,
             queue,
             memory_grant,

From faa6a2aa1c3079d301caa21341aee9e4694a9052 Mon Sep 17 00:00:00 2001
From: kmatasfp <33095685+kmatas@users.noreply.github.com>
Date: Sun, 14 Jun 2026 21:30:55 -0700
Subject: [PATCH 60/60] debug: observe if Store references are kept alive

---
 golem-worker-executor/src/durable_host/mod.rs | 22 ++++++++
 golem-worker-executor/src/metrics.rs          | 24 +++++++++
 golem-worker-executor/src/worker/mod.rs       | 51 +------------------
 3 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/golem-worker-executor/src/durable_host/mod.rs b/golem-worker-executor/src/durable_host/mod.rs
index 6b134d87fb..01052d8600 100644
--- a/golem-worker-executor/src/durable_host/mod.rs
+++ b/golem-worker-executor/src/durable_host/mod.rs
@@ -249,6 +249,27 @@ pub struct DurableWorkerCtx<Ctx: WorkerCtx> {
     execution_status: Arc<RwLock<ExecutionStatus>>,
     pub websocket_connection_pool: websocket::WebSocketConnectionPool,
     resource_limits: Arc<AtomicResourceEntry>,
+    _store_alive_guard: StoreAliveGuard,
+}
+
+/// Increments the live-`Store` gauge on construction and decrements it on drop.
+/// Held as a field of [`DurableWorkerCtx`], which is the data of the wasmtime
+/// `Store`, so the gauge follows the `Store`'s true lifetime regardless of which
+/// reference keeps it alive. A persistent gap above the resident-worker count
+/// indicates `Store`s retained after their worker was deleted.
+struct StoreAliveGuard;
+
+impl StoreAliveGuard {
+    fn new() -> Self {
+        crate::metrics::workers::inc_worker_store_alive();
+        StoreAliveGuard
+    }
+}
+
+impl Drop for StoreAliveGuard {
+    fn drop(&mut self) {
+        crate::metrics::workers::dec_worker_store_alive();
+    }
 }
 
 impl<Ctx: WorkerCtx> DurableWorkerCtx<Ctx> {
@@ -476,6 +497,7 @@ impl<Ctx: WorkerCtx> DurableWorkerCtx<Ctx> {
             worker_dir,
             execution_status,
             resource_limits,
+            _store_alive_guard: StoreAliveGuard::new(),
         })
     }
 
diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index 980ae3842d..3b4d976f6e 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -260,6 +260,12 @@ pub mod workers {
             &["executor_id"]
         )
         .unwrap();
+        pub static ref WORKER_STORE_ALIVE_COUNT: GaugeVec = register_gauge_vec!(
+            "golem_worker_store_alive_count",
+            "Live wasmtime Store contexts on this executor, counted by the Store's own lifetime: incremented when a worker's Store is constructed and decremented when it is dropped. Diverging above the resident-worker count means Stores are retained after the owning worker was deleted",
+            &["executor_id"]
+        )
+        .unwrap();
         pub static ref WORKER_KV_CACHE_VALUE_SIZE_BYTES: HistogramVec = register_histogram_vec!(
             "worker_kv_cache_value_size_bytes",
             "Bytes of a value written to the Worker-namespace KV cache (worker status blob size)",
@@ -361,6 +367,7 @@ pub mod workers {
         WORKER_WAITING_FOR_MEMORY_COUNT
             .with_label_values(&[id])
             .set(0.0);
+        WORKER_STORE_ALIVE_COUNT.with_label_values(&[id]).set(0.0);
         WORKER_MEMORY_GROW_REJECTED_TOTAL
             .with_label_values(&[id])
             .inc_by(0.0);
@@ -378,6 +385,23 @@ pub mod workers {
             .dec();
     }
 
+    /// Increments the live-`Store` gauge; called when a worker's wasmtime
+    /// `Store` context is constructed. Paired with [`dec_worker_store_alive`],
+    /// which runs from a guard dropped with the `Store` itself, so the gauge
+    /// tracks the `Store`'s true lifetime rather than the worker's accounting.
+    pub fn inc_worker_store_alive() {
+        WORKER_STORE_ALIVE_COUNT
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .inc();
+    }
+
+    /// Decrements the live-`Store` gauge when a worker's `Store` context is dropped.
+    pub fn dec_worker_store_alive() {
+        WORKER_STORE_ALIVE_COUNT
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .dec();
+    }
+
     pub fn inc_worker_waiting_for_memory() {
         WORKER_WAITING_FOR_MEMORY_COUNT
             .with_label_values(&[crate::metrics::storage::executor_id()])
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index 9601fd7a47..23ba710f81 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -1702,7 +1702,7 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                 run_loop_handle,
                 notify,
             } => {
-                join_run_loop_with_watchdog(&self.owned_agent_id, run_loop_handle).await;
+                run_loop_handle.await.expect("Failed to join run loop");
 
                 let mut instance_guard = self.instance.lock().await;
                 let is_deleting = match &*instance_guard {
@@ -2308,37 +2308,6 @@ impl Drop for WorkerStatusMetric {
     }
 }
 
-/// Joins the invocation-loop task, logging a periodic warning if the join does
-/// not complete promptly.
-///
-/// The stop path drops the worker's command channel sender and then waits here
-/// for the loop task to observe the close and return. If the loop is parked at
-/// an await that never observes the closed channel, this join never completes
-/// and the delete/stop that triggered it is wedged. The watchdog surfaces that
-/// condition with the agent id so a stuck teardown is diagnosable from logs.
-async fn join_run_loop_with_watchdog(owned_agent_id: &OwnedAgentId, handle: JoinHandle<()>) {
-    const WARN_AFTER: Duration = Duration::from_secs(10);
-
-    let mut handle = handle;
-    let mut waited = Duration::ZERO;
-    loop {
-        match tokio::time::timeout(WARN_AFTER, &mut handle).await {
-            Ok(join_result) => {
-                join_result.expect("Failed to join run loop");
-                return;
-            }
-            Err(_) => {
-                waited += WARN_AFTER;
-                warn!(
-                    agent_id = %owned_agent_id,
-                    waited_secs = waited.as_secs(),
-                    "Still waiting for invocation loop to exit during stop; the loop task may be parked at an uninterruptible await"
-                );
-            }
-        }
-    }
-}
-
 pub fn merge_agent_env_with_default_env(
     agent_env: Option<Vec<(String, String)>>,
     default_agent_env: BTreeMap<String, String>,
@@ -2545,7 +2514,6 @@ impl Drop for WaitingWorker {
 #[derive(Debug)]
 struct RunningWorker {
     handle: Option<JoinHandle<()>>,
-    owned_agent_id: OwnedAgentId,
     sender: UnboundedSender<WorkerCommand>,
     queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
     /// The worker's memory reservation with the admission gate, covering its
@@ -2582,19 +2550,6 @@ impl Drop for RunningWorker {
                 record_filesystem_pool_released(bytes);
             }
         }
-        // A `RunningWorker` is normally torn down via `stop()`, which takes the
-        // handle so the invocation loop can be joined after its command channel
-        // closes. If the handle is still present here and the task has not
-        // finished, the loop task is being orphaned: it will keep running and
-        // hold its wasmtime `Store` even though the worker is gone.
-        if let Some(handle) = &self.handle
-            && !handle.is_finished()
-        {
-            warn!(
-                agent_id = %self.owned_agent_id,
-                "RunningWorker dropped while its invocation loop task is still running; the loop task is being orphaned and will leak its Store"
-            );
-        }
     }
 }
 
@@ -2613,7 +2568,6 @@ impl RunningWorker {
 
         let active_clone = queue.clone();
         let owned_agent_id_clone = owned_agent_id.clone();
-        let owned_agent_id_log = owned_agent_id.clone();
         let waiting_for_command = Arc::new(AtomicBool::new(false));
         let waiting_for_command_clone = waiting_for_command.clone();
         let interrupt_signal = Arc::new(async_lock::Mutex::new(None));
@@ -2634,7 +2588,6 @@ impl RunningWorker {
         );
         let handle = tokio::task::spawn(
             async move {
-                debug!(agent_id = %owned_agent_id_log, "Invocation loop task started");
                 RunningWorker::invocation_loop(
                     receiver,
                     active_clone,
@@ -2648,14 +2601,12 @@ impl RunningWorker {
                 )
                 .instrument(span)
                 .await;
-                debug!(agent_id = %owned_agent_id_log, "Invocation loop task exited");
             }
             .in_current_span(),
         );
 
         RunningWorker {
             handle: Some(handle),
-            owned_agent_id,
             sender,
             queue,
             memory_grant,