diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 095bdbd015..7a8d2fb9f1 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -836,10 +836,14 @@ jobs:
           - platform: linux/amd64
             name: linux/amd64
             target: x86_64-unknown-linux-gnu
+            target_cpu_env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS
+            target_cpu: "-C target-cpu=x86-64-v3"
           - platform: linux/arm64
             name: linux/arm64
             target: aarch64-unknown-linux-gnu
             cross: true
+            target_cpu_env: CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUSTFLAGS
+            target_cpu: "-C target-cpu=neoverse-n1"
     name: docker-targets-build (${{ matrix.platform.platform }})
     steps:
       - uses: actions/checkout@v5
@@ -854,6 +858,12 @@ jobs:
         run: |
           platform=${{ matrix.platform.platform }}
           echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
+          # Target-scoped rustflags pin the instruction-set baseline for the
+          # published images: x86-64-v3 on amd64, Neoverse-N1 on arm64. The
+          # per-target CARGO_TARGET_<TRIPLE>_RUSTFLAGS form keeps each flag tied
+          # to its own triple (note: a set RUSTFLAGS would take precedence over
+          # it); CARGO_-prefixed vars are passed into the cross container too.
+          echo "${{ matrix.platform.target_cpu_env }}=${{ matrix.platform.target_cpu }}" >> $GITHUB_ENV
 
       - run: cargo install cross
         if: ${{ matrix.platform.cross }}
diff --git a/Cargo.lock b/Cargo.lock
index 9296de06e0..1821488058 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4277,7 +4277,10 @@ dependencies = [
  "log",
  "mac_address",
  "md5",
+ "metrics",
+ "metrics-exporter-prometheus",
  "metrohash",
+ "mimalloc",
  "nonempty-collections",
  "nonzero_ext",
  "pgvector",
@@ -4301,6 +4304,7 @@ dependencies = [
  "tempfile",
  "test-r",
  "tokio",
+ "tokio-metrics",
  "tokio-stream",
  "tokio-tungstenite 0.25.0",
  "tokio-util",
@@ -5735,6 +5739,15 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
+[[package]]
+name = "libmimalloc-sys"
+version = "0.1.49"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a45a52f43e1c16f667ccfe4dd8c85b7f7c204fd5e3bf46c5b0db9a5c3c0b8e9"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "libredox"
 version = "0.1.15"
@@ -6007,6 +6020,46 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "metrics"
+version = "0.24.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89550ee9f79e88fef3119de263694973a8adb26c21d75322164fb8c493039fe2"
+dependencies = [
+ "portable-atomic",
+ "rapidhash",
+]
+
+[[package]]
+name = "metrics-exporter-prometheus"
+version = "0.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd7399781913e5393588a8d8c6a2867bf85fb38eaf2502fdce465aad2dc6f034"
+dependencies = [
+ "base64 0.22.1",
+ "indexmap 2.14.0",
+ "metrics",
+ "metrics-util",
+ "quanta",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "metrics-util"
+version = "0.19.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8496cc523d1f94c1385dd8f0f0c2c480b2b8aeccb5b7e4485ad6365523ae376"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+ "hashbrown 0.15.5",
+ "metrics",
+ "quanta",
+ "rand 0.9.2",
+ "rand_xoshiro",
+ "sketches-ddsketch",
+]
+
 [[package]]
 name = "metrohash"
 version = "1.0.7"
@@ -6043,6 +6096,15 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "mimalloc"
+version = "0.1.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d4139bb28d14ad1facf21d5eb8825051b326e172d216b39f6d31df53cc97862"
+dependencies = [
+ "libmimalloc-sys",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -7669,6 +7731,21 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "quanta"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
+dependencies = [
+ "crossbeam-utils",
+ "libc",
+ "once_cell",
+ "raw-cpuid",
+ "wasi",
+ "web-sys",
+ "winapi",
+]
+
 [[package]]
 name = "quick-error"
 version = "1.2.3"
@@ -7856,6 +7933,15 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "rand_xoshiro"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41"
+dependencies = [
+ "rand_core 0.9.5",
+]
+
 [[package]]
 name = "range-set-blaze"
 version = "0.1.16"
@@ -7868,6 +7954,24 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "rapidhash"
+version = "4.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "raw-cpuid"
+version = "11.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
+dependencies = [
+ "bitflags 2.11.1",
+]
+
 [[package]]
 name = "rayon"
 version = "1.11.0"
@@ -9103,6 +9207,12 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e"
 
+[[package]]
+name = "sketches-ddsketch"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b"
+
 [[package]]
 name = "slab"
 version = "0.4.12"
@@ -9946,6 +10056,19 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "tokio-metrics"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9e81d53caf955549b1dec7af4ac2149e94cc25ed97b4a545151140281e2f528"
+dependencies = [
+ "futures-util",
+ "metrics",
+ "pin-project-lite",
+ "tokio",
+ "tokio-stream",
+]
+
 [[package]]
 name = "tokio-native-tls"
 version = "0.3.1"
diff --git a/Cargo.toml b/Cargo.toml
index b6ba881258..32b71fd001 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -154,10 +154,13 @@ log = "0.4.26"
 mac_address = "1.1.8"
 mappable-rc = "0.1.1"
 md5 = "0.7.0"
+metrics = "0.24.2"
+metrics-exporter-prometheus = { version = "0.16.2", default-features = false }
 metrohash = "1.0.7"
 miette = { version = "7.6.0", features = ["fancy"] }
 mime = "0.3.17"
 mime_guess = "2.0.5"
+mimalloc = "0.1.52"
 minijinja = "2.7.0"
 
 nanoid = "0.4.0"
@@ -248,6 +251,7 @@ textwrap = "0.16.1"
 thiserror = "2.0.12"
 time = { version = "0.3.41", features = ["default", "macros"] }
 tokio = { version = "1.44", features = ["macros", "rt-multi-thread", "sync", "io-std", "net", "tracing", "process", "signal"] }
+tokio-metrics = { version = "0.5.0", features = ["metrics-rs-integration"] }
 tokio-postgres = "0.7.13"
 tokio-rustls = { version = "0.26.2" }
 tokio-stream = { version = "0.1", features = ["sync"] }
@@ -337,6 +341,7 @@ debug = "line-tables-only"
 
 [profile.release]
 panic = "abort"
+lto = "thin"
 
 [profile.benchmarks]
 inherits = "release"
diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env
index 077c693c32..4349e54ebe 100644
--- a/golem-debugging-service/config/debug-worker-executor.sample.env
+++ b/golem-debugging-service/config/debug-worker-executor.sample.env
@@ -55,6 +55,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -228,6 +230,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml
index 8ee03a9c23..01a81fd83a 100644
--- a/golem-debugging-service/config/debug-worker-executor.toml
+++ b/golem-debugging-service/config/debug-worker-executor.toml
@@ -96,6 +96,8 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
+component_size_coefficient = 2.0
+enable_measured_admission = true
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
 
@@ -364,6 +366,8 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# component_size_coefficient = 2.0
+# enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
diff --git a/golem-debugging-service/src/config.rs b/golem-debugging-service/src/config.rs
index dc6299652b..6a9869550b 100644
--- a/golem-debugging-service/src/config.rs
+++ b/golem-debugging-service/src/config.rs
@@ -98,6 +98,7 @@ impl DebugConfig {
             max_in_function_retry_delay: std::time::Duration::from_secs(20),
             max_websocket_connections: 100,
             quota_service: QuotaServiceConfig::default(),
+            runtime_metrics_sampling_interval: std::time::Duration::from_secs(5),
         }
     }
 }
diff --git a/golem-debugging-service/src/lib.rs b/golem-debugging-service/src/lib.rs
index d6062f2cf1..2dea553b6b 100644
--- a/golem-debugging-service/src/lib.rs
+++ b/golem-debugging-service/src/lib.rs
@@ -375,13 +375,19 @@ pub async fn run_debug_worker_executor<T: Bootstrap<DebugContext> + ?Sized + Sen
 ) -> anyhow::Result<RunDetails> {
     debug!("Initializing debug worker executor");
 
-    let total_system_memory = golem_config.memory.total_system_memory();
-    let system_memory = golem_config.memory.system_memory();
-    let worker_memory = golem_config.memory.worker_memory();
+    let memory_snapshot =
+        golem_worker_executor::services::active_workers::memory_probe::default_probe(
+            golem_config.memory.system_memory_override,
+        )
+        .snapshot();
+    let total_system_memory = memory_snapshot.limit_bytes;
+    let used_system_memory = memory_snapshot.current_bytes;
+    let worker_memory =
+        (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64;
     info!(
-        "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}",
+        "Measured memory limit: {}, Currently used: {}, Usable for workers: {}",
         ISizeFormatter::new(total_system_memory, humansize::BINARY),
-        ISizeFormatter::new(system_memory, humansize::BINARY),
+        ISizeFormatter::new(used_system_memory, humansize::BINARY),
         ISizeFormatter::new(worker_memory, humansize::BINARY)
     );
 
diff --git a/golem-service-base/src/observability.rs b/golem-service-base/src/observability.rs
index 98a83dd36e..f9f54b554c 100644
--- a/golem-service-base/src/observability.rs
+++ b/golem-service-base/src/observability.rs
@@ -18,21 +18,42 @@ use axum::response::IntoResponse;
 use axum::routing::get;
 use http::Response;
 use prometheus::{Encoder, Registry, TextEncoder};
+use std::sync::Arc;
 use tokio::net::{TcpListener, ToSocketAddrs};
 use tokio::task::JoinSet;
 use tracing::{Instrument, info};
 
+/// A callback that renders additional metrics in Prometheus text exposition
+/// format, appended to the output of the `prometheus`-crate registry on the
+/// `/metrics` endpoint. Used to surface metrics from a second metrics façade
+/// (e.g. the `metrics`-crate recorder driving tokio-metrics) on the same
+/// scrape endpoint.
+pub type ExtraMetrics = Arc<dyn Fn() -> String + Send + Sync>;
+
 pub async fn start_health_and_metrics_server(
     addr: impl ToSocketAddrs,
     registry: Registry,
     body_message: &'static str,
     join_set: &mut JoinSet<Result<(), anyhow::Error>>,
+) -> Result<u16, anyhow::Error> {
+    start_health_and_metrics_server_with_extra(addr, registry, None, body_message, join_set).await
+}
+
+pub async fn start_health_and_metrics_server_with_extra(
+    addr: impl ToSocketAddrs,
+    registry: Registry,
+    extra: Option<ExtraMetrics>,
+    body_message: &'static str,
+    join_set: &mut JoinSet<Result<(), anyhow::Error>>,
 ) -> Result<u16, anyhow::Error> {
     let app = Router::new()
         .route("/healthcheck", get(move || async move { body_message }))
         .route(
             "/metrics",
-            get(|| async move { prometheus_metrics(registry.clone()) }),
+            get(move || {
+                let extra = extra.clone();
+                async move { prometheus_metrics(registry.clone(), extra) }
+            }),
         );
 
     let listener = TcpListener::bind(addr).await?;
@@ -51,13 +72,17 @@ pub async fn start_health_and_metrics_server(
     Ok(local_addr.port())
 }
 
-pub fn prometheus_metrics(registry: Registry) -> impl IntoResponse {
+pub fn prometheus_metrics(registry: Registry, extra: Option<ExtraMetrics>) -> impl IntoResponse {
     let encoder = TextEncoder::new();
     let mut buffer = Vec::new();
 
     let metric_families = registry.gather();
     encoder.encode(&metric_families, &mut buffer).unwrap();
 
+    if let Some(extra) = extra {
+        buffer.extend_from_slice(extra().as_bytes());
+    }
+
     Response::builder()
         .header("Content-Type", encoder.format_type())
         .body(Body::from(buffer))
diff --git a/golem-test-framework/src/benchmark/config.rs b/golem-test-framework/src/benchmark/config.rs
index c011ac65b0..0d172baa24 100644
--- a/golem-test-framework/src/benchmark/config.rs
+++ b/golem-test-framework/src/benchmark/config.rs
@@ -116,7 +116,7 @@ pub struct BenchmarkSuiteItem {
 impl BenchmarkSuiteItem {
     pub fn runs(&self, mode: &TestMode) -> Vec<RunConfig> {
         let cluster_size: Vec<usize> = match mode {
-            TestMode::Provided { .. } => {
+            TestMode::Provided { .. } | TestMode::Cloud { .. } => {
                 vec![0]
             }
             _ => self
@@ -163,3 +163,83 @@ impl BenchmarkSuiteItem {
         res
     }
 }
+
+/// Smoke tests for cloud-mode wiring that do not require running services.
+///
+/// For a full end-to-end smoke test (real HTTP clients, cleanup, benchmark API contract), run
+/// the binary against a local Spawned cluster. NOTE(review): `cloud_mode()` below builds
+/// `TestMode::Cloud` without admin account id/email fields — confirm these flags still exist:
+///
+/// ```text
+/// cargo run --bin benchmarks -- benchmark cold-start-unknown-small \
+///   --size 1 --iterations 1 --length 0 \
+///   cloud \
+///   --api-url http://localhost:8081 \
+///   --apps-base-domain golem.cloud \
+///   --admin-account-id <uuid> \
+///   --admin-account-email <email> \
+///   --admin-account-token <token> \
+///   --builtin-plugin-owner-account-id <uuid> \
+///   --default-plan-id <uuid>
+/// ```
+#[cfg(test)]
+mod cloud_mode_smoke {
+    use super::*;
+    use test_r::test;
+    use url::Url;
+    use uuid::Uuid;
+
+    fn cloud_mode() -> TestMode {
+        TestMode::Cloud {
+            api_url: Url::parse("https://release.dev-api.golem.cloud").unwrap(),
+            apps_base_domain: "apps.dev.golem.cloud".to_string(),
+            admin_account_token: "test-token".to_string(),
+            builtin_plugin_owner_account_id: Uuid::nil(),
+            default_plan_id: Uuid::nil(),
+            shard_manager_grpc_host: None,
+            shard_manager_grpc_port: None,
+            component_directory: "test-components".to_string(),
+        }
+    }
+
+    /// With three `cluster_size` values but a single size and length, cloud mode
+    /// must yield exactly one `RunConfig`, and its `cluster_size` must be 0.
+    #[test]
+    fn runs_returns_single_cluster_size_zero_run() {
+        let mode = cloud_mode();
+        let item = BenchmarkSuiteItem {
+            name: "cold-start-unknown-small".to_string(),
+            iterations: 3,
+            cluster_size: vec![1, 3, 5], // must be ignored in cloud mode
+            size: vec![10],
+            length: vec![100],
+            disable_compilation_cache: None,
+        };
+        let runs = item.runs(&mode);
+        assert_eq!(runs.len(), 1, "cloud mode ignores cluster_size variations");
+        assert_eq!(runs[0].cluster_size, 0, "cloud mode cluster_size must be 0");
+        assert_eq!(runs[0].size, 10);
+        assert_eq!(runs[0].length, 100);
+    }
+
+    /// Size and length combinations still expand into separate runs; only the
+    /// `cluster_size` dimension is collapsed to the single value 0.
+    #[test]
+    fn runs_expands_size_and_length_but_not_cluster_size() {
+        let mode = cloud_mode();
+        let item = BenchmarkSuiteItem {
+            name: "latency-small".to_string(),
+            iterations: 1,
+            cluster_size: vec![1, 3],
+            size: vec![5, 10],
+            length: vec![50, 100],
+            disable_compilation_cache: None,
+        };
+        let runs = item.runs(&mode);
+        // 1 (collapsed cluster_size) × 2 sizes × 2 lengths = 4 runs
+        assert_eq!(runs.len(), 4);
+        for r in &runs {
+            assert_eq!(r.cluster_size, 0);
+        }
+    }
+}
diff --git a/golem-test-framework/src/benchmark/mod.rs b/golem-test-framework/src/benchmark/mod.rs
index 1f349afddd..5e82adde15 100644
--- a/golem-test-framework/src/benchmark/mod.rs
+++ b/golem-test-framework/src/benchmark/mod.rs
@@ -16,7 +16,9 @@ mod config;
 mod results;
 
 pub use config::{BenchmarkConfig, BenchmarkSuite, BenchmarkSuiteItem, RunConfig};
-pub use results::{BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey};
+pub use results::{
+    BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey, RunMetadata,
+};
 
 use crate::config::benchmark::TestMode;
 use async_trait::async_trait;
@@ -301,6 +303,7 @@ impl<B: Benchmark> BenchmarkApi for B {
             description: B::description().to_string(),
             runs,
             results,
+            run_id: None,
         }
     }
 }
diff --git a/golem-test-framework/src/benchmark/results.rs b/golem-test-framework/src/benchmark/results.rs
index 1cb0f329b6..afb7319a7d 100644
--- a/golem-test-framework/src/benchmark/results.rs
+++ b/golem-test-framework/src/benchmark/results.rs
@@ -484,6 +484,97 @@ impl Display for BenchmarkResultView {
     }
 }
 
+/// Cloud-mode run metadata collected by the buildspec and passed via environment variables.
+/// All fields are optional — missing env vars produce `None` rather than failing the run.
+#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct RunMetadata {
+    /// The `golem-oss` commit SHA that was built and deployed.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub golem_oss_commit_sha: Option<String>,
+    /// The `golem-cloud` (kubernetes manifests) commit SHA that was deployed.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub kubernetes_manifest_commit_sha: Option<String>,
+    /// Number of Ready `worker-executor` pods observed at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub observed_cluster_size: Option<u32>,
+    /// Container image tag of the deployed `worker-executor`.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_executor_image_tag: Option<String>,
+    /// Container image tag of the deployed `registry-service`.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub registry_service_image_tag: Option<String>,
+    /// Container image tag of the deployed `worker-service`.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_service_image_tag: Option<String>,
+    /// Aurora ACU capacity for the main (`golem_dev`) cluster at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub aurora_acu_main: Option<f64>,
+    /// Aurora ACU capacity for the indexed-storage cluster at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub aurora_acu_indexed: Option<f64>,
+    /// Aurora ACU capacity for the keyvalue-storage cluster at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub aurora_acu_keyvalue: Option<f64>,
+    /// Ready replica count for `worker-executor` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_executor_replicas: Option<u32>,
+    /// Ready replica count for `worker-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub worker_service_replicas: Option<u32>,
+    /// Ready replica count for `registry-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub registry_service_replicas: Option<u32>,
+    /// Ready replica count for `compilation-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub compilation_service_replicas: Option<u32>,
+    /// Ready replica count for `debugging-service` at run start.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub debugging_service_replicas: Option<u32>,
+    /// Free-form note from the `workflow_dispatch` trigger.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub note: Option<String>,
+}
+
+impl RunMetadata {
+    /// Builds a `RunMetadata` from the `GOLEM_BENCH_*` environment variables. Unset, empty,
+    /// unparsable or (for ACU values) non-finite variables yield `None`; it never fails.
+    pub fn from_env() -> Self {
+        fn env_str(key: &str) -> Option<String> {
+            std::env::var(key).ok().filter(|v| !v.is_empty())
+        }
+        fn env_u32(key: &str) -> Option<u32> {
+            env_str(key)?.trim().parse().ok() // trim: tolerate whitespace-padded counts
+        }
+        fn env_f64(key: &str) -> Option<f64> {
+            let acu: f64 = env_str(key)?.trim().parse().ok()?;
+            acu.is_finite().then_some(acu) // "NaN"/"inf" parse as f64 but are not valid JSON
+        }
+        Self {
+            golem_oss_commit_sha: env_str("GOLEM_BENCH_OSS_COMMIT_SHA"),
+            kubernetes_manifest_commit_sha: env_str("GOLEM_BENCH_K8S_MANIFEST_COMMIT_SHA"),
+            observed_cluster_size: env_u32("GOLEM_BENCH_OBSERVED_CLUSTER_SIZE"),
+            worker_executor_image_tag: env_str("GOLEM_BENCH_WORKER_EXECUTOR_IMAGE_TAG"),
+            registry_service_image_tag: env_str("GOLEM_BENCH_REGISTRY_SERVICE_IMAGE_TAG"),
+            worker_service_image_tag: env_str("GOLEM_BENCH_WORKER_SERVICE_IMAGE_TAG"),
+            aurora_acu_main: env_f64("GOLEM_BENCH_AURORA_ACU_MAIN"),
+            aurora_acu_indexed: env_f64("GOLEM_BENCH_AURORA_ACU_INDEXED"),
+            aurora_acu_keyvalue: env_f64("GOLEM_BENCH_AURORA_ACU_KEYVALUE"),
+            worker_executor_replicas: env_u32("GOLEM_BENCH_WORKER_EXECUTOR_REPLICAS"),
+            worker_service_replicas: env_u32("GOLEM_BENCH_WORKER_SERVICE_REPLICAS"),
+            registry_service_replicas: env_u32("GOLEM_BENCH_REGISTRY_SERVICE_REPLICAS"),
+            compilation_service_replicas: env_u32("GOLEM_BENCH_COMPILATION_SERVICE_REPLICAS"),
+            debugging_service_replicas: env_u32("GOLEM_BENCH_DEBUGGING_SERVICE_REPLICAS"),
+            note: env_str("GOLEM_BENCH_RUN_NOTE"),
+        }
+    }
+
+    /// Returns `true` if every field is `None` (nothing was read from env).
+    pub fn is_empty(&self) -> bool {
+        self == &Self::default()
+    }
+}
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct BenchmarkSuiteResultCollection {
     pub runs: Vec<BenchmarkSuiteResult>,
@@ -491,10 +582,20 @@ pub struct BenchmarkSuiteResultCollection {
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct BenchmarkSuiteResult {
+    /// Result format version: `1` from this binary, `0` when the field is absent on input.
+    #[serde(default)]
+    pub schema_version: u32,
     pub suite: String,
     pub environment: String,
     pub version: String,
     pub timestamp: DateTime<Utc>,
+    /// Run-id (`bench-{run_id}` in cloud mode) for cross-run correlation and orphaned-state GC.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub run_id: Option<String>,
+    /// Cloud-mode run metadata populated from `GOLEM_BENCH_*` environment variables.
+    /// `None` in Spawned or Provided modes where cluster metadata is not available.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub run_metadata: Option<RunMetadata>,
     pub results: Vec<BenchmarkResult>,
 }
 
@@ -526,10 +627,13 @@ impl BenchmarkSuiteResult {
         );
 
         Self {
+            schema_version: 1,
             suite: suite.to_string(),
             environment,
             version: golem_common::golem_version().to_string(),
             timestamp: Utc::now(),
+            run_id: None,
+            run_metadata: None,
             results: vec![],
         }
     }
@@ -606,6 +710,10 @@ pub struct BenchmarkResult {
     pub description: String,
     pub runs: Vec<RunConfig>,
     pub results: Vec<BenchmarkRunResult>,
+    /// Suite-level run-id. Set in cloud mode to `bench-{run_id}` to allow
+    /// cross-run correlation and garbage collection of orphaned state.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub run_id: Option<String>,
 }
 
 impl BenchmarkResult {
diff --git a/golem-test-framework/src/components/component_compilation_service/mod.rs b/golem-test-framework/src/components/component_compilation_service/mod.rs
index f80d2f84d6..50da099698 100644
--- a/golem-test-framework/src/components/component_compilation_service/mod.rs
+++ b/golem-test-framework/src/components/component_compilation_service/mod.rs
@@ -21,6 +21,7 @@ use tracing::Level;
 
 pub mod provided;
 pub mod spawned;
+pub mod unavailable;
 
 #[async_trait]
 pub trait ComponentCompilationService: Send + Sync {
diff --git a/golem-test-framework/src/components/component_compilation_service/unavailable.rs b/golem-test-framework/src/components/component_compilation_service/unavailable.rs
new file mode 100644
index 0000000000..fb355cd0b3
--- /dev/null
+++ b/golem-test-framework/src/components/component_compilation_service/unavailable.rs
@@ -0,0 +1,35 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::ComponentCompilationService;
+use async_trait::async_trait;
+
+/// Placeholder `ComponentCompilationService` for cloud mode, where the service
+/// is an internal cluster component with no external exposure. `grpc_host` and
+/// `grpc_port` panic with an explanatory message; `kill` is a no-op so that
+/// `kill_all()` completes.
+pub struct UnavailableComponentCompilationService;
+
+#[async_trait]
+impl ComponentCompilationService for UnavailableComponentCompilationService {
+    fn grpc_host(&self) -> String {
+        panic!("component_compilation_service() is not available in cloud mode");
+    }
+
+    fn grpc_port(&self) -> u16 {
+        panic!("component_compilation_service() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/rdb/mod.rs b/golem-test-framework/src/components/rdb/mod.rs
index 5f1b5c7fb8..ce8863c10e 100644
--- a/golem-test-framework/src/components/rdb/mod.rs
+++ b/golem-test-framework/src/components/rdb/mod.rs
@@ -29,6 +29,7 @@ pub mod docker_mysql;
 pub mod docker_postgres;
 pub mod provided_postgres;
 pub mod sqlite;
+pub mod unavailable;
 
 #[async_trait]
 pub trait Rdb: Send + Sync {
diff --git a/golem-test-framework/src/components/rdb/unavailable.rs b/golem-test-framework/src/components/rdb/unavailable.rs
new file mode 100644
index 0000000000..1df99efe70
--- /dev/null
+++ b/golem-test-framework/src/components/rdb/unavailable.rs
@@ -0,0 +1,31 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::{DbInfo, Rdb};
+use async_trait::async_trait;
+
+/// Placeholder `Rdb` for cloud mode, where the database is an internal cluster
+/// component with no external exposure. `info` panics with an explanatory
+/// message; lifecycle teardown (`kill`) is a no-op so that `kill_all()`
+/// completes.
+pub struct UnavailableRdb;
+
+#[async_trait]
+impl Rdb for UnavailableRdb {
+    fn info(&self) -> DbInfo {
+        panic!("rdb() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/redis/mod.rs b/golem-test-framework/src/components/redis/mod.rs
index df14595c7a..62346ec293 100644
--- a/golem-test-framework/src/components/redis/mod.rs
+++ b/golem-test-framework/src/components/redis/mod.rs
@@ -20,6 +20,7 @@ use tracing::info;
 pub mod provided;
 pub mod spawned;
 pub mod spawned_tls;
+pub mod unavailable;
 
 #[async_trait]
 pub trait Redis: Send + Sync {
diff --git a/golem-test-framework/src/components/redis/unavailable.rs b/golem-test-framework/src/components/redis/unavailable.rs
new file mode 100644
index 0000000000..0f24489fe9
--- /dev/null
+++ b/golem-test-framework/src/components/redis/unavailable.rs
@@ -0,0 +1,43 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::Redis;
+use async_trait::async_trait;
+
+/// Placeholder `Redis` for cloud mode, where Redis is an internal cluster
+/// component with no external exposure. `assert_valid`, `private_host`,
+/// `private_port` and `prefix` all panic with the same message; `kill` is a
+/// no-op so that `kill_all()` completes.
+pub struct UnavailableRedis;
+
+#[async_trait]
+impl Redis for UnavailableRedis {
+    fn assert_valid(&self) {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    fn private_host(&self) -> String {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    fn private_port(&self) -> u16 {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    fn prefix(&self) -> &str {
+        panic!("redis() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/redis_monitor/mod.rs b/golem-test-framework/src/components/redis_monitor/mod.rs
index eb73fe0e0d..2a24665ec5 100644
--- a/golem-test-framework/src/components/redis_monitor/mod.rs
+++ b/golem-test-framework/src/components/redis_monitor/mod.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod spawned;
+pub mod unavailable;
 
 pub trait RedisMonitor: Send + Sync {
     fn assert_valid(&self);
diff --git a/golem-test-framework/src/components/redis_monitor/unavailable.rs b/golem-test-framework/src/components/redis_monitor/unavailable.rs
new file mode 100644
index 0000000000..bdde53d231
--- /dev/null
+++ b/golem-test-framework/src/components/redis_monitor/unavailable.rs
@@ -0,0 +1,29 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::RedisMonitor;
+
+/// Placeholder `RedisMonitor` for cloud mode, where Redis is an internal
+/// cluster component with no external exposure. `kill` is a no-op so that
+/// `kill_all()` completes; `assert_valid` panics with a clear message
+/// instead of reporting a monitor as valid.
+pub struct UnavailableRedisMonitor;
+
+impl RedisMonitor for UnavailableRedisMonitor {
+    fn assert_valid(&self) {
+        panic!("redis_monitor() is not available in cloud mode");
+    }
+
+    fn kill(&self) {}
+}
diff --git a/golem-test-framework/src/components/registry_service/cloud.rs b/golem-test-framework/src/components/registry_service/cloud.rs
new file mode 100644
index 0000000000..79e5d03935
--- /dev/null
+++ b/golem-test-framework/src/components/registry_service/cloud.rs
@@ -0,0 +1,167 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::RegistryService;
+use async_trait::async_trait;
+use golem_client::api::RegistryServiceClientLive;
+use golem_client::{Context, Security};
+use golem_common::model::account::{AccountEmail, AccountId};
+use golem_common::model::auth::TokenSecret;
+use golem_common::model::plan::PlanId;
+use std::time::Duration;
+use tokio::sync::OnceCell;
+use tracing::info;
+use url::Url;
+
+/// Registry-service client for cloud mode.
+///
+/// In the deployed Golem environment registry-service and worker-service share
+/// one Gateway API hostname (e.g. `https://release.dev-api.golem.cloud`); the
+/// Gateway routes each request to the correct backend by URL path. This struct
+/// holds that `api_url` together with the admin token and the account/plan
+/// ids it reports; its HTTP client is created lazily on first use.
+pub struct CloudRegistryService {
+    api_url: Url,
+    admin_token: TokenSecret,
+    builtin_plugin_owner_account_id: AccountId,
+    default_plan_id: PlanId,
+    base_http_client: OnceCell<reqwest_middleware::ClientWithMiddleware>,
+}
+
+impl CloudRegistryService {
+    pub fn new(
+        api_url: Url,
+        admin_token: TokenSecret,
+        builtin_plugin_owner_account_id: AccountId,
+        default_plan_id: PlanId,
+    ) -> Self {
+        info!("Using cloud API gateway at {api_url}");
+        Self {
+            api_url,
+            admin_token,
+            builtin_plugin_owner_account_id,
+            default_plan_id,
+            base_http_client: OnceCell::new(),
+        }
+    }
+}
+
+/// Builds the tuned HTTP client used by the cloud-mode service clients.
+///
+/// Settings: up to 1024 idle pooled connections kept per host, 90-second
+/// idle timeout, TCP nodelay, and a 180-second overall request timeout.
+///
+/// Note: `http2_prior_knowledge()` is deliberately **not** set. Prior
+/// knowledge is for h2c (HTTP/2 over plain HTTP). All cloud endpoints are
+/// HTTPS, where HTTP/2 is negotiated through ALPN during the TLS handshake
+/// (TLS termination happens at Envoy). Setting prior knowledge would bypass
+/// ALPN and can cause protocol errors.
+pub fn new_cloud_reqwest_client() -> reqwest_middleware::ClientWithMiddleware {
+    let inner = reqwest::Client::builder()
+        .timeout(Duration::from_secs(180))
+        .tcp_nodelay(true)
+        .pool_idle_timeout(Duration::from_secs(90))
+        .pool_max_idle_per_host(1024)
+        .build()
+        .expect("Failed to build cloud HTTP client");
+    reqwest_middleware::ClientBuilder::new(inner)
+        .with(reqwest_tracing::TracingMiddleware::default())
+        .build()
+}
+
+#[async_trait]
+impl RegistryService for CloudRegistryService {
+    fn http_host(&self) -> String {
+        self.api_url.host_str().unwrap_or("localhost").to_string()
+    }
+
+    fn http_port(&self) -> u16 {
+        self.api_url.port_or_known_default().unwrap_or(443) // 443 if no port is known
+    }
+
+    fn grpc_host(&self) -> String {
+        panic!("grpc_host() is not available through the Gateway in cloud mode");
+    }
+
+    fn grpc_port(&self) -> u16 {
+        panic!("grpc_port() is not available through the Gateway in cloud mode");
+    }
+
+    fn admin_account_id(&self) -> AccountId {
+        AccountId(uuid::Uuid::nil()) // placeholder: this struct holds no admin account id
+    }
+
+    fn admin_account_email(&self) -> AccountEmail {
+        AccountEmail::new(String::new()) // placeholder: this struct holds no admin email
+    }
+
+    fn admin_account_token(&self) -> TokenSecret {
+        self.admin_token.clone()
+    }
+
+    fn builtin_plugin_owner_account_id(&self) -> AccountId {
+        self.builtin_plugin_owner_account_id
+    }
+
+    fn default_plan(&self) -> PlanId {
+        self.default_plan_id
+    }
+
+    fn low_fuel_plan(&self) -> PlanId {
+        panic!(
+            "low_fuel_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    fn low_disk_space_plan(&self) -> PlanId {
+        panic!(
+            "low_disk_space_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    fn low_http_calls_plan(&self) -> PlanId {
+        panic!(
+            "low_http_calls_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    fn low_rpc_calls_plan(&self) -> PlanId {
+        panic!(
+            "low_rpc_calls_plan is not supported in cloud mode; \
+             the benchmark calling this method requires a local or provided cluster"
+        );
+    }
+
+    async fn kill(&self) {}
+
+    async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware {
+        self.base_http_client
+            .get_or_init(|| async { new_cloud_reqwest_client() })
+            .await
+            .clone()
+    }
+
+    async fn client(&self, token: &TokenSecret) -> RegistryServiceClientLive {
+        RegistryServiceClientLive {
+            context: Context {
+                client: self.base_http_client().await,
+                base_url: self.api_url.clone(),
+                security_token: Security::Bearer(token.secret().to_string()),
+            },
+        }
+    }
+}
diff --git a/golem-test-framework/src/components/registry_service/mod.rs b/golem-test-framework/src/components/registry_service/mod.rs
index d38f88577d..42b0b9ddfd 100644
--- a/golem-test-framework/src/components/registry_service/mod.rs
+++ b/golem-test-framework/src/components/registry_service/mod.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod cloud;
 pub mod provided;
 pub mod spawned;
 
diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs
index 5245865e4b..91ed2ed2da 100644
--- a/golem-test-framework/src/components/shard_manager/mod.rs
+++ b/golem-test-framework/src/components/shard_manager/mod.rs
@@ -14,6 +14,7 @@
 
 pub mod provided;
 pub mod spawned;
+pub mod unavailable;
 
 use super::rdb::Rdb;
 use super::registry_service::RegistryService;
@@ -30,7 +31,7 @@ use std::sync::Arc;
 use std::time::Duration;
 use tonic::codec::CompressionEncoding;
 use tonic::transport::Channel;
-use tracing::Level;
+use tracing::{Level, warn};
 
 #[async_trait]
 pub trait ShardManager: Send + Sync {
@@ -46,25 +47,30 @@ pub trait ShardManager: Send + Sync {
     async fn restart(&self, number_of_shards_override: Option<usize>);
 
     async fn get_routing_table(&self) -> crate::Result<RoutingTable> {
-        let routing_table = self
-            .client()
-            .await
-            .get_routing_table(GetRoutingTableRequest {})
-            .await
-            .expect("Unable to fetch the routing table from shard-manager-service");
-
-        match routing_table.into_inner() {
-            shardmanager::v1::GetRoutingTableResponse {
-                result:
-                    Some(shardmanager::v1::get_routing_table_response::Result::Success(routing_table)),
-            } => Ok(routing_table
-                .try_into()
-                .map_err(|e| anyhow!("Failed converting routing table: {e}"))?),
-            shardmanager::v1::GetRoutingTableResponse {
-                result: Some(shardmanager::v1::get_routing_table_response::Result::Failure(err)),
-            } => Err(anyhow!("Failed to get routing table: {err:?}")),
-            _ => Err(anyhow!("Failed to get routing table")),
+        // Retry at a fixed 1s interval to tolerate transient port-forward reconnects
+        // (the watchdog restarts in ~500ms); no sleep follows the final attempt.
+        let max_attempts = 10;
+        let retry_delay = Duration::from_secs(1);
+        let mut last_err = anyhow!("get_routing_table: no attempts made");
+
+        for attempt in 1..=max_attempts {
+            match try_get_routing_table(&self.grpc_host(), self.grpc_port()).await {
+                Ok(rt) => return Ok(rt),
+                Err(err) => {
+                    warn!(
+                        attempt, max_attempts,
+                        error = %err,
+                        "Failed to fetch routing table, retrying..."
+                    );
+                    last_err = err;
+                    if attempt < max_attempts {
+                        tokio::time::sleep(retry_delay).await;
+                    }
+                }
+            }
         }
+
+        Err(last_err)
     }
 }
 
@@ -76,6 +82,34 @@ async fn new_client(host: &str, grpc_port: u16) -> ShardManagerServiceClient<Cha
         .accept_compressed(CompressionEncoding::Gzip)
 }
 
+/// Opens a fresh gRPC connection to shard-manager and requests the routing
+/// table once; connect, request and conversion failures are returned as errors.
+async fn try_get_routing_table(host: &str, grpc_port: u16) -> crate::Result<RoutingTable> {
+    use shardmanager::v1::get_routing_table_response::Result as Outcome;
+
+    let endpoint = format!("http://{host}:{grpc_port}");
+    let mut shard_manager = ShardManagerServiceClient::connect(endpoint)
+        .await
+        .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))?
+        .send_compressed(CompressionEncoding::Gzip)
+        .accept_compressed(CompressionEncoding::Gzip);
+
+    let response = shard_manager
+        .get_routing_table(GetRoutingTableRequest {})
+        .await
+        .map_err(|e| {
+            anyhow!("Unable to fetch the routing table from shard-manager-service: {e}")
+        })?;
+
+    match response.into_inner().result {
+        Some(Outcome::Success(table)) => table
+            .try_into()
+            .map_err(|e| anyhow!("Failed converting routing table: {e}")),
+        Some(Outcome::Failure(err)) => Err(anyhow!("Failed to get routing table: {err:?}")),
+        _ => Err(anyhow!("Failed to get routing table")),
+    }
+}
+
 async fn wait_for_startup(
     host: &str,
     grpc_port: u16,
diff --git a/golem-test-framework/src/components/shard_manager/provided.rs b/golem-test-framework/src/components/shard_manager/provided.rs
index d7e4ff1305..84d213a5fb 100644
--- a/golem-test-framework/src/components/shard_manager/provided.rs
+++ b/golem-test-framework/src/components/shard_manager/provided.rs
@@ -40,10 +40,10 @@ impl ShardManager for ProvidedShardManager {
     }
 
     async fn kill(&self) {
-        panic!("Cannot kill provided shard manager");
+        // No-op: we do not own this shard manager process, so there is nothing to stop.
     }
 
     async fn restart(&self, _number_of_shards_override: Option<usize>) {
-        panic!("Cannot restart provided shard manager");
+        // No-op: we do not own this process, so the shard-count override is ignored.
     }
 }
diff --git a/golem-test-framework/src/components/shard_manager/unavailable.rs b/golem-test-framework/src/components/shard_manager/unavailable.rs
new file mode 100644
index 0000000000..834dfb8d2c
--- /dev/null
+++ b/golem-test-framework/src/components/shard_manager/unavailable.rs
@@ -0,0 +1,56 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::ShardManager;
+use async_trait::async_trait;
+use golem_common::model::RoutingTable;
+
+/// A `ShardManager` that is not directly reachable. Used in cloud mode when no
+/// shard-manager port-forward is configured; pass `--shard-manager-grpc-host`
+/// and `--shard-manager-grpc-port` to use a real `ProvidedShardManager`.
+///
+/// `kill`/`restart` are no-ops and the host/port accessors panic with a clear
+/// message. `get_routing_table()` is overridden to return an error directly
+/// (without touching those accessors) so that callers, e.g. the throughput
+/// benchmark, can fall back to the unlabeled single-bucket mode.
+pub struct UnavailableShardManager;
+
+#[async_trait]
+impl ShardManager for UnavailableShardManager {
+    fn grpc_host(&self) -> String {
+        panic!(
+            "shard_manager() requires --shard-manager-grpc-host and \
+             --shard-manager-grpc-port to be configured in cloud mode"
+        );
+    }
+
+    fn grpc_port(&self) -> u16 {
+        panic!(
+            "shard_manager() requires --shard-manager-grpc-host and \
+             --shard-manager-grpc-port to be configured in cloud mode"
+        );
+    }
+
+    async fn kill(&self) {}
+
+    async fn restart(&self, _number_of_shards_override: Option<usize>) {}
+
+    async fn get_routing_table(&self) -> crate::Result<RoutingTable> {
+        Err(anyhow::anyhow!(
+            "shard_manager is not configured in cloud mode; \
+             pass --shard-manager-grpc-host and --shard-manager-grpc-port \
+             to enable routing table fetch and local/remote RPC labeling"
+        ))
+    }
+}
diff --git a/golem-test-framework/src/components/worker_executor_cluster/mod.rs b/golem-test-framework/src/components/worker_executor_cluster/mod.rs
index 2dc8e21745..e1db10b237 100644
--- a/golem-test-framework/src/components/worker_executor_cluster/mod.rs
+++ b/golem-test-framework/src/components/worker_executor_cluster/mod.rs
@@ -18,6 +18,7 @@ use std::sync::Arc;
 
 pub mod provided;
 pub mod spawned;
+pub mod unavailable;
 
 #[async_trait]
 pub trait WorkerExecutorCluster: Send + Sync {
diff --git a/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs
new file mode 100644
index 0000000000..53a5cc87be
--- /dev/null
+++ b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs
@@ -0,0 +1,63 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::components::worker_executor::WorkerExecutor;
+use crate::components::worker_executor_cluster::WorkerExecutorCluster;
+use async_trait::async_trait;
+use std::sync::Arc;
+
+/// A `WorkerExecutorCluster` whose individual executors are not directly
+/// reachable. Used in cloud mode, where executors are internal cluster
+/// components with no external exposure.
+///
+/// `kill_all` and `restart_all` are no-ops, so cluster-wide lifecycle calls
+/// succeed. `is_running()` returns `true` so that
+/// `ensure_all_deps_running()` is a no-op. `size`, `to_vec`, `stop`, `start`
+/// and the `*_indices` queries panic with a clear message.
+pub struct UnavailableWorkerExecutorCluster;
+
+#[async_trait]
+impl WorkerExecutorCluster for UnavailableWorkerExecutorCluster {
+    fn size(&self) -> usize {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn kill_all(&self) {}
+
+    async fn restart_all(&self) {}
+
+    async fn stop(&self, _index: usize) {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn start(&self, _index: usize) {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    fn to_vec(&self) -> Vec<Arc<dyn WorkerExecutor>> {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn stopped_indices(&self) -> Vec<usize> {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn started_indices(&self) -> Vec<usize> {
+        panic!("worker_executor_cluster() is not available in cloud mode");
+    }
+
+    async fn is_running(&self) -> bool {
+        true
+    }
+}
diff --git a/golem-test-framework/src/components/worker_service/cloud.rs b/golem-test-framework/src/components/worker_service/cloud.rs
new file mode 100644
index 0000000000..ceb60f4fbe
--- /dev/null
+++ b/golem-test-framework/src/components/worker_service/cloud.rs
@@ -0,0 +1,113 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::components::registry_service::cloud::new_cloud_reqwest_client;
+use crate::components::worker_service::WorkerService;
+use async_trait::async_trait;
+use golem_client::api::{AgentClientLive, WorkerClientLive};
+use golem_client::{Context, Security};
+use golem_common::model::auth::TokenSecret;
+use tokio::sync::OnceCell;
+use tracing::info;
+use url::Url;
+
+/// Worker-service client for cloud mode.
+///
+/// In the deployed Golem environment registry-service and worker-service share
+/// one Gateway API hostname (e.g. `https://release.dev-api.golem.cloud`). This
+/// struct holds that `api_url`; the Gateway routes to worker-service by URL
+/// path (`/v1/components/*/workers/**`, `/v1/agents/**`). The HTTP client is
+/// created lazily on first use and reused afterwards.
+pub struct CloudWorkerService {
+    api_url: Url,
+    base_http_client: OnceCell<reqwest_middleware::ClientWithMiddleware>,
+}
+
+impl CloudWorkerService {
+    pub fn new(api_url: Url) -> Self {
+        info!("Using cloud worker-service via API gateway at {api_url}");
+        Self {
+            api_url,
+            base_http_client: OnceCell::new(),
+        }
+    }
+}
+
+#[async_trait]
+impl WorkerService for CloudWorkerService {
+    fn http_host(&self) -> String {
+        self.api_url.host_str().unwrap_or("localhost").to_string()
+    }
+
+    fn http_port(&self) -> u16 {
+        self.api_url.port_or_known_default().unwrap_or(443)
+    }
+
+    fn grpc_host(&self) -> String {
+        panic!("grpc_host() is not available through the Gateway in cloud mode");
+    }
+
+    fn gprc_port(&self) -> u16 {
+        panic!("gprc_port() is not available through the Gateway in cloud mode");
+    }
+
+    fn custom_request_host(&self) -> String {
+        // Code-first HTTP API deployments are reached via the apps base domain
+        // (e.g. `*.apps.dev.golem.cloud`), not through this host.
+        panic!("custom_request_host() is not available in cloud mode");
+    }
+
+    fn custom_request_port(&self) -> u16 {
+        // Code-first HTTP API deployments are reached via the apps base domain
+        // (e.g. `*.apps.dev.golem.cloud`), not through this port.
+        panic!("custom_request_port() is not available in cloud mode");
+    }
+
+    fn mcp_port(&self) -> u16 {
+        panic!("mcp_port() is not available in cloud mode");
+    }
+
+    async fn kill(&self) {}
+
+    async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware {
+        self.base_http_client
+            .get_or_init(|| async { new_cloud_reqwest_client() })
+            .await
+            .clone()
+    }
+
+    /// Overrides the trait default to use the configured API gateway URL
+    /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`.
+    async fn worker_http_client(&self, token: &TokenSecret) -> WorkerClientLive {
+        WorkerClientLive {
+            context: Context {
+                client: self.base_http_client().await,
+                base_url: self.api_url.clone(),
+                security_token: Security::Bearer(token.secret().to_string()),
+            },
+        }
+    }
+
+    /// Overrides the trait default to use the configured API gateway URL
+    /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`.
+    async fn agent_http_client(&self, token: &TokenSecret) -> AgentClientLive {
+        AgentClientLive {
+            context: Context {
+                client: self.base_http_client().await,
+                base_url: self.api_url.clone(),
+                security_token: Security::Bearer(token.secret().to_string()),
+            },
+        }
+    }
+}
diff --git a/golem-test-framework/src/components/worker_service/mod.rs b/golem-test-framework/src/components/worker_service/mod.rs
index 6885e86696..126cc988c9 100644
--- a/golem-test-framework/src/components/worker_service/mod.rs
+++ b/golem-test-framework/src/components/worker_service/mod.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod cloud;
 pub mod provided;
 pub mod spawned;
 
diff --git a/golem-test-framework/src/config/benchmark.rs b/golem-test-framework/src/config/benchmark.rs
index a1a304436b..34ac140d23 100644
--- a/golem-test-framework/src/config/benchmark.rs
+++ b/golem-test-framework/src/config/benchmark.rs
@@ -16,15 +16,21 @@ use crate::benchmark::BenchmarkConfig;
 use crate::components::component_compilation_service::ComponentCompilationService;
 use crate::components::component_compilation_service::provided::ProvidedComponentCompilationService;
 use crate::components::component_compilation_service::spawned::SpawnedComponentCompilationService;
+use crate::components::component_compilation_service::unavailable::UnavailableComponentCompilationService;
+use crate::components::rdb::PostgresInfo;
+use crate::components::rdb::Rdb;
 use crate::components::rdb::docker_postgres::DockerPostgresRdb;
 use crate::components::rdb::provided_postgres::ProvidedPostgresRdb;
-use crate::components::rdb::{PostgresInfo, Rdb};
+use crate::components::rdb::unavailable::UnavailableRdb;
 use crate::components::redis::Redis;
 use crate::components::redis::provided::ProvidedRedis;
 use crate::components::redis::spawned::SpawnedRedis;
+use crate::components::redis::unavailable::UnavailableRedis;
 use crate::components::redis_monitor::RedisMonitor;
 use crate::components::redis_monitor::spawned::SpawnedRedisMonitor;
+use crate::components::redis_monitor::unavailable::UnavailableRedisMonitor;
 use crate::components::registry_service::RegistryService;
+use crate::components::registry_service::cloud::CloudRegistryService;
 use crate::components::registry_service::provided::ProvidedRegistryService;
 use crate::components::registry_service::spawned::SpawnedRegistryService;
 use crate::components::service::Service;
@@ -32,10 +38,13 @@ use crate::components::service::spawned::SpawnedService;
 use crate::components::shard_manager::ShardManager;
 use crate::components::shard_manager::provided::ProvidedShardManager;
 use crate::components::shard_manager::spawned::SpawnedShardManager;
+use crate::components::shard_manager::unavailable::UnavailableShardManager;
 use crate::components::worker_executor_cluster::WorkerExecutorCluster;
 use crate::components::worker_executor_cluster::provided::ProvidedWorkerExecutorCluster;
 use crate::components::worker_executor_cluster::spawned::SpawnedWorkerExecutorCluster;
+use crate::components::worker_executor_cluster::unavailable::UnavailableWorkerExecutorCluster;
 use crate::components::worker_service::WorkerService;
+use crate::components::worker_service::cloud::CloudWorkerService;
 use crate::components::worker_service::provided::ProvidedWorkerService;
 use crate::components::worker_service::spawned::SpawnedWorkerService;
 use crate::config::TestDependencies;
@@ -51,11 +60,24 @@ use golem_service_base::storage::blob::BlobStorage;
 use golem_service_base::storage::blob::fs::FileSystemBlobStorage;
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
 use tempfile::TempDir;
 use tracing::Level;
+use url::Url;
 use uuid::Uuid;
 
+/// Process-level UUID, set once when the first cloud-mode benchmark context
+/// is created. All cloud contexts within the same binary invocation share this
+/// run-id, which is used to prefix account/app/env names
+/// (`bench-{run_id}-…`) and written into result JSON metadata.
+static CLOUD_BENCH_RUN_ID: OnceLock<Uuid> = OnceLock::new();
+
+/// Returns a copy of the suite-level run-id, or `None` when no cloud
+/// benchmark context has been created in this process yet.
+pub fn cloud_bench_run_id() -> Option<Uuid> {
+    CLOUD_BENCH_RUN_ID.get().copied()
+}
+
 /// Test dependencies created from command line arguments
 ///
 /// To be used when a single executable with an async entry point requires
@@ -75,6 +97,12 @@ pub struct BenchmarkTestDependencies {
     component_directory: PathBuf,
     component_temp_directory: Arc<TempDir>,
     registry_service: Arc<dyn RegistryService>,
+    /// Set to `Some` in cloud mode. Used to prefix account/app/env names with
+    /// `bench-{run_id}-` so that orphaned state is traceable.
+    run_id: Option<Uuid>,
+    /// The apps base domain for cloud mode (e.g. `apps.golem.cloud`). Used to
+    /// construct HTTP API deployment domains as `{env_id}.{apps_base_domain}`.
+    apps_base_domain: Option<String>,
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -222,6 +250,58 @@ pub enum TestMode {
         #[arg(long, default_value = "test-components")]
         component_directory: String,
     },
+    /// Cloud mode: run benchmarks against a deployed Golem environment via
+    /// Gateway-API hostnames. No local service processes are spawned.
+    ///
+    /// All management API calls (registry-service, worker-service, agents) go
+    /// through a single Gateway hostname (`--api-url`). HTTP API deployment
+    /// access (code-first HTTP APIs) goes through `{env_id}.{apps_base_domain}`.
+    ///
+    /// For `golem-dev`:
+    ///   `--api-url https://release.dev-api.golem.cloud`
+    ///   `--apps-base-domain apps.dev.golem.cloud`
+    #[command()]
+    Cloud {
+        /// Base URL of the deployed Golem API Gateway. Both registry-service
+        /// and worker-service paths are routed internally by the Gateway.
+        ///
+        /// For the `golem-dev` environment this is
+        /// `https://release.dev-api.golem.cloud`.
+        #[arg(long)]
+        api_url: Url,
+        /// Wildcard base domain used to build per-environment HTTP API
+        /// deployment hostnames: `{env_id}.{apps_base_domain}`.
+        ///
+        /// For the `golem-dev` environment this is `apps.dev.golem.cloud`.
+        #[arg(long)]
+        apps_base_domain: String,
+        /// Bearer token for the admin account. Used to create a fresh user
+        /// account for each benchmark run, which then owns all benchmark state.
+        #[arg(long)]
+        admin_account_token: String,
+        /// UUID of the builtin-plugin-owner account.
+        /// Only needed for environment-plugin-grant tests; benchmarks do not
+        /// use it so the default (nil UUID) is fine for benchmark runs.
+        #[arg(long, default_value_t = Uuid::nil())]
+        builtin_plugin_owner_account_id: Uuid,
+        /// UUID of the default plan on the target cluster.
+        /// Only needed for environment-plugin-grant tests; benchmarks do not
+        /// use it so the default (nil UUID) is fine for benchmark runs.
+        #[arg(long, default_value_t = Uuid::nil())]
+        default_plan_id: Uuid,
+        /// Optional shard-manager gRPC hostname for a kubectl port-forward
+        /// (e.g. `localhost`). Must be passed together with
+        /// `--shard-manager-grpc-port`; with both set, the throughput benchmark
+        /// fetches the routing table and labels RPC pairs as local/remote.
+        #[arg(long, requires = "shard_manager_grpc_port")]
+        shard_manager_grpc_host: Option<String>,
+        /// Optional shard-manager gRPC port (e.g. `9090`); requires the host flag.
+        #[arg(long, requires = "shard_manager_grpc_host")]
+        shard_manager_grpc_port: Option<u16>,
+        /// Directory containing test WASM component files.
+        #[arg(long, default_value = "test-components")]
+        component_directory: String,
+    },
 }
 
 impl BenchmarkTestDependencies {
@@ -419,6 +499,8 @@ impl BenchmarkTestDependencies {
             initial_agent_files_service,
             component_temp_directory: Arc::new(TempDir::new().unwrap()),
             registry_service,
+            run_id: None,
+            apps_base_domain: None,
         }
     }
 
@@ -542,6 +624,8 @@ impl BenchmarkTestDependencies {
                     initial_agent_files_service,
                     component_temp_directory: Arc::new(TempDir::new().unwrap()),
                     registry_service,
+                    run_id: None,
+                    apps_base_domain: None,
                 }
             }
             TestMode::Spawned {
@@ -590,17 +674,93 @@ impl BenchmarkTestDependencies {
                 )
                 .await
             }
+            TestMode::Cloud {
+                api_url,
+                apps_base_domain,
+                admin_account_token,
+                builtin_plugin_owner_account_id,
+                default_plan_id,
+                shard_manager_grpc_host,
+                shard_manager_grpc_port,
+                component_directory,
+            } => {
+                let blob_storage = Arc::new(
+                    FileSystemBlobStorage::new(
+                        &std::env::temp_dir().join("golem-bench-blob-storage"),
+                    )
+                    .await
+                    .unwrap(),
+                );
+                let initial_agent_files_service =
+                    Arc::new(InitialAgentFilesService::new(blob_storage.clone()));
+
+                // Use the process-level run_id (shared across all cloud contexts in
+                // this process so all benchmarks in a suite carry the same run ID).
+                let run_id = *CLOUD_BENCH_RUN_ID.get_or_init(Uuid::new_v4);
+                tracing::info!("Cloud benchmark run_id: {run_id}");
+
+                // Both registry-service and worker-service are reachable via the
+                // same Gateway hostname; routing is path-based.
+                let registry_service: Arc<dyn RegistryService> =
+                    Arc::new(CloudRegistryService::new(
+                        api_url.clone(),
+                        TokenSecret::trusted(admin_account_token.clone()),
+                        AccountId(*builtin_plugin_owner_account_id),
+                        PlanId(*default_plan_id),
+                    ));
+
+                let shard_manager: Arc<dyn ShardManager> =
+                    match (shard_manager_grpc_host, shard_manager_grpc_port) {
+                        (Some(host), Some(port)) => {
+                            Arc::new(ProvidedShardManager::new(host.clone(), 0, *port))
+                        }
+                        _ => Arc::new(UnavailableShardManager),
+                    };
+
+                let worker_service: Arc<dyn WorkerService> =
+                    Arc::new(CloudWorkerService::new(api_url.clone()));
+
+                Self {
+                    rdb: Arc::new(UnavailableRdb),
+                    redis: Arc::new(UnavailableRedis),
+                    redis_monitor: Arc::new(UnavailableRedisMonitor),
+                    shard_manager,
+                    component_compilation_service: Arc::new(UnavailableComponentCompilationService),
+                    worker_service,
+                    worker_executor_cluster: Arc::new(UnavailableWorkerExecutorCluster),
+                    component_directory: Path::new(component_directory).to_path_buf(),
+                    blob_storage,
+                    initial_agent_files_service,
+                    component_temp_directory: Arc::new(TempDir::new().unwrap()),
+                    registry_service,
+                    run_id: Some(run_id),
+                    apps_base_domain: Some(apps_base_domain.clone()),
+                }
+            }
         }
     }
 
-    /// Checks if all the spawned dependencies are still running, and if not, panicks
+    /// Checks if all the spawned dependencies are still running, and if not, panics.
     ///
     /// This can be used as a checkpoint in benchmarks to avoid infinite retries.
+    /// In cloud mode the check runs against `UnavailableWorkerExecutorCluster`
+    /// and is expected to pass — the cloud cluster is managed externally.
     pub async fn ensure_all_deps_running(&self) {
         if !self.worker_executor_cluster.is_running().await {
             panic!("Worker executor process(es) stopped");
         }
     }
+
+    /// Returns the run-id for this benchmark context, if running in cloud mode.
+    /// Used to prefix accounts/apps/envs with `bench-{run_id}-`.
+    pub fn run_id(&self) -> Option<Uuid> {
+        self.run_id
+    }
+
+    /// Returns the apps base domain for cloud mode (e.g. `apps.golem.cloud`).
+    pub fn apps_base_domain(&self) -> Option<&str> {
+        self.apps_base_domain.as_deref()
+    }
 }
 
 #[async_trait]
@@ -652,6 +812,10 @@ impl TestDependencies for BenchmarkTestDependencies {
     fn registry_service(&self) -> Arc<dyn RegistryService> {
         self.registry_service.clone()
     }
+
+    fn bench_name_prefix(&self) -> Option<String> {
+        self.run_id.map(|id| format!("bench-{id}-"))
+    }
 }
 
 #[allow(dead_code)]
diff --git a/golem-test-framework/src/config/dsl_impl.rs b/golem-test-framework/src/config/dsl_impl.rs
index b228a5235e..f2d5472175 100644
--- a/golem-test-framework/src/config/dsl_impl.rs
+++ b/golem-test-framework/src/config/dsl_impl.rs
@@ -883,8 +883,9 @@ impl<Deps: TestDependencies> TestDslExtended for TestUserContext<Deps> {
         environment_options: &EnvironmentOptions,
     ) -> anyhow::Result<(Application, Environment)> {
         let client = self.registry_service_client().await;
-        let app_name = ApplicationName(format!("app-{}", Uuid::new_v4()));
-        let env_name = EnvironmentName(format!("env-{}", Uuid::new_v4()));
+        let prefix = self.deps.bench_name_prefix().unwrap_or_default();
+        let app_name = ApplicationName(format!("{prefix}app-{}", Uuid::new_v4()));
+        let env_name = EnvironmentName(format!("{prefix}env-{}", Uuid::new_v4()));
 
         let application = client
             .create_application(
diff --git a/golem-test-framework/src/config/mod.rs b/golem-test-framework/src/config/mod.rs
index f5c14ace60..d8bdbe6b39 100644
--- a/golem-test-framework/src/config/mod.rs
+++ b/golem-test-framework/src/config/mod.rs
@@ -56,6 +56,13 @@ pub trait TestDependencies: Send + Sync + Clone {
     fn initial_agent_files_service(&self) -> Arc<InitialAgentFilesService>;
     fn registry_service(&self) -> Arc<dyn RegistryService>;
 
+    /// Returns an optional name prefix applied to benchmark-created accounts,
+    /// applications, and environments. Non-`None` in cloud mode, where the
+    /// prefix is `bench-{run_id}-` to make orphaned state traceable.
+    fn bench_name_prefix(&self) -> Option<String> {
+        None
+    }
+
     async fn admin(&self) -> TestUserContext<Self>
     where
         Self: Sized,
@@ -82,7 +89,12 @@ pub trait TestDependencies: Send + Sync + Clone {
             .client(&registry_service.admin_account_token())
             .await;
 
-        let name = Uuid::new_v4().to_string();
+        let uuid = Uuid::new_v4().to_string();
+        let name = if let Some(prefix) = self.bench_name_prefix() {
+            format!("{prefix}{uuid}")
+        } else {
+            uuid
+        };
         let account_data = AccountCreation {
             email: AccountEmail::new(format!("{name}@golem.cloud")),
             name,
diff --git a/golem-worker-executor-test-utils/src/lib.rs b/golem-worker-executor-test-utils/src/lib.rs
index ee81b41531..6d86ffb7a7 100644
--- a/golem-worker-executor-test-utils/src/lib.rs
+++ b/golem-worker-executor-test-utils/src/lib.rs
@@ -82,6 +82,7 @@ use golem_worker_executor::preview2::golem::agent::host::{
 };
 use golem_worker_executor::preview2::{golem_api_1_x, golem_durability};
 use golem_worker_executor::services::active_workers::ActiveWorkers;
+use golem_worker_executor::services::active_workers::memory_probe::FixedProbe;
 use golem_worker_executor::services::agent_types::AgentTypesService;
 use golem_worker_executor::services::agent_webhooks::AgentWebhooksService;
 use golem_worker_executor::services::blob_store::{
@@ -533,6 +534,16 @@ fn make_base_test_config(deps: &WorkerExecutorTestDependencies) -> GolemConfig {
         // without attempting a gRPC connection to a registry service that does
         // not exist in this test setup.
         resource_limits: ResourceLimitsConfig::Disabled(ResourceLimitsDisabledConfig {}),
+        // The measured-headroom admission gate requires the executor to own its
+        // memory environment (cgroup/process). The in-process test harness runs
+        // the executor alongside the test framework and other services, so the
+        // probe cannot isolate this executor's footprint. Disable the gate so
+        // admission always proceeds and tests are not subject to a memory limit
+        // derived from the shared host.
+        memory: MemoryConfig {
+            enable_measured_admission: false,
+            ..Default::default()
+        },
         ..Default::default()
     }
 }
@@ -696,6 +707,16 @@ pub async fn start_customized(
     apply_sqlite_storage_config(&mut config, deps, context);
     config.memory = MemoryConfig {
         system_memory_override,
+        // Enable the measured-headroom gate when a test pins a memory limit, so
+        // memory-pressure tests exercise the real admission controller under that
+        // limit. The test bootstrap (create_active_workers) feeds the gate a
+        // fixed probe reporting this limit with zero current usage, so admission
+        // is decided on the granted accounting against the pinned limit and is
+        // not perturbed by the shared test process's RSS. Otherwise the gate is
+        // disabled (see make_base_test_config). The usable ratio
+        // (worker_memory_ratio, default 0.8) applies, matching the pre-gate
+        // semaphore pool size of system_memory_override * ratio.
+        enable_measured_admission: system_memory_override.is_some(),
         ..Default::default()
     };
     config.filesystem_storage = FilesystemStorageConfig {
@@ -1358,6 +1379,31 @@ impl InvocationContextManagement for TestWorkerCtx {
 
 #[async_trait]
 impl Bootstrap<TestWorkerCtx> for TestServerBootstrap {
+    fn create_active_workers(
+        &self,
+        golem_config: &GolemConfig,
+    ) -> Arc<ActiveWorkers<TestWorkerCtx>> {
+        // This harness runs the executor inside the test process, next to the
+        // test framework and other services, so a process-RSS probe would also
+        // count memory that does not belong to this executor.
+        //
+        // If the test pinned a limit through system_memory_override, admission
+        // is fed a FixedProbe built from (limit, 0): the pinned limit with zero
+        // current usage. Decisions then depend only on the granted accounting
+        // against that limit; worker_memory_ratio still applies, matching the
+        // pre-gate semaphore pool size of system_memory_override * ratio.
+        // Without a pinned limit the regular constructor is used unchanged.
+        let memory = &golem_config.memory;
+        let storage = &golem_config.filesystem_storage;
+        let active_workers = if let Some(limit) = memory.system_memory_override {
+            let probe = Box::new(FixedProbe::new(limit, 0));
+            ActiveWorkers::new_with_probe(probe, memory, storage)
+        } else {
+            ActiveWorkers::new(memory, storage)
+        };
+        Arc::new(active_workers)
+    }
+
     fn create_shard_manager_service(
         &self,
         _shard_manager_client: Arc<dyn golem_service_base::clients::shard_manager::ShardManager>,
diff --git a/golem-worker-executor/Cargo.toml b/golem-worker-executor/Cargo.toml
index bfe2dcafe8..9e83a12c1f 100644
--- a/golem-worker-executor/Cargo.toml
+++ b/golem-worker-executor/Cargo.toml
@@ -73,7 +73,10 @@ lazy_static = { workspace = true }
 log = { workspace = true }
 mac_address = { workspace = true, features = ["serde"] }
 md5 = { workspace = true }
+metrics = { workspace = true }
+metrics-exporter-prometheus = { workspace = true }
 metrohash = { workspace = true }
+mimalloc = { workspace = true }
 nonempty-collections = { workspace = true }
 nonzero_ext = { workspace = true }
 pgvector = { workspace = true }
@@ -92,6 +95,7 @@ sqlx-core = { workspace = true }
 sysinfo = { workspace = true }
 tempfile = { workspace = true }
 tokio = { workspace = true }
+tokio-metrics = { workspace = true }
 tokio-stream = { workspace = true }
 tokio-tungstenite = { workspace = true }
 tokio-util = { workspace = true }
diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env
index 2a52884966..35725ab38f 100644
--- a/golem-worker-executor/config/worker-executor.sample.env
+++ b/golem-worker-executor/config/worker-executor.sample.env
@@ -4,6 +4,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0"
 GOLEM__HTTP_PORT=8082
 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s"
 GOLEM__MAX_WEBSOCKET_CONNECTIONS=100
+GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s"
 GOLEM__TRACING_FILE_NAME_WITH_PORT=true
 GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25
 GOLEM__ACTIVE_WORKERS__TTL="8h"
@@ -72,6 +73,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -213,6 +216,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0"
 GOLEM__HTTP_PORT=8082
 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s"
 GOLEM__MAX_WEBSOCKET_CONNECTIONS=100
+GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s"
 GOLEM__TRACING_FILE_NAME_WITH_PORT=true
 GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25
 GOLEM__ACTIVE_WORKERS__TTL="8h"
@@ -291,6 +295,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
@@ -432,6 +438,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0"
 GOLEM__HTTP_PORT=8082
 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s"
 GOLEM__MAX_WEBSOCKET_CONNECTIONS=100
+GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s"
 GOLEM__TRACING_FILE_NAME_WITH_PORT=true
 GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25
 GOLEM__ACTIVE_WORKERS__TTL="8h"
@@ -480,6 +487,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
+GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
 GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml
index 4c89275519..df58d45b50 100644
--- a/golem-worker-executor/config/worker-executor.toml
+++ b/golem-worker-executor/config/worker-executor.toml
@@ -3,6 +3,7 @@ http_address = "0.0.0.0"
 http_port = 8082
 max_in_function_retry_delay = "20s"
 max_websocket_connections = 100
+runtime_metrics_sampling_interval = "5s"
 tracing_file_name_with_port = true
 
 [active_workers]
@@ -125,6 +126,8 @@ max_oplog_query_pages_size = 100
 
 [memory]
 acquire_retry_delay = "500ms"
+component_size_coefficient = 2.0
+enable_measured_admission = true
 worker_estimate_coefficient = 1.1
 worker_memory_ratio = 0.8
 
@@ -331,6 +334,7 @@ without_time = false
 # http_port = 8082
 # max_in_function_retry_delay = "20s"
 # max_websocket_connections = 100
+# runtime_metrics_sampling_interval = "5s"
 # tracing_file_name_with_port = true
 # 
 # [active_workers]
@@ -456,6 +460,8 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# component_size_coefficient = 2.0
+# enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
@@ -661,6 +667,7 @@ without_time = false
 # http_port = 8082
 # max_in_function_retry_delay = "20s"
 # max_websocket_connections = 100
+# runtime_metrics_sampling_interval = "5s"
 # tracing_file_name_with_port = true
 # 
 # [active_workers]
@@ -757,6 +764,8 @@ without_time = false
 # 
 # [memory]
 # acquire_retry_delay = "500ms"
+# component_size_coefficient = 2.0
+# enable_measured_admission = true
 # worker_estimate_coefficient = 1.1
 # worker_memory_ratio = 0.8
 # 
diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt
new file mode 100644
index 0000000000..eb12d21790
--- /dev/null
+++ b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt
@@ -0,0 +1,9 @@
+# Seeds for failure cases proptest has generated in the past. It is
+# automatically read and these particular cases re-run before any
+# novel cases are generated.
+#
+# It is recommended to check this file in to source control so that
+# everyone who runs the test benefits from these saved cases.
+cc b49eb145c9dca28d347382d8e482bb2cb6c5d256ccaba7532b370fbadc2bb3fb # shrinks to (limit, residents) = (500, []), schedule = [Admit(220), Admit(92), Admit(189)]
+cc 9727f7e7aab54f8f48e6b856f9d70428fd8503767677fa7c232e27263273e071 # shrinks to limit = 815, schedule = [Grant(485), Grant(1), Grant(7), Exit(1), Grant(1), FaultIn(2, 1), Grant(40), Exit(2), Grant(284)]
+cc 41321d47abd75b283d651e63e40c0f5191b680b908c05879c02d5f36b70de66c # shrinks to (limit, residents) = (1369, [Resident { size: 144, priority: Idle }, Resident { size: 228, priority: Warm }, Resident { size: 152, priority: Warm }, Resident { size: 101, priority: Idle }, Resident { size: 68, priority: Warm }, Resident { size: 45, priority: Idle }, Resident { size: 30, priority: Idle }, Resident { size: 20, priority: Idle }, Resident { size: 13, priority: Warm }, Resident { size: 9, priority: Idle }, Resident { size: 6, priority: Idle }]), schedule = [Admit(270), Admit(785), Admit(250), Admit(146), Admit(456)]
diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt
new file mode 100644
index 0000000000..5845bf0e72
--- /dev/null
+++ b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt
@@ -0,0 +1,7 @@
+# Seeds for failure cases proptest has generated in the past. It is
+# automatically read and these particular cases re-run before any
+# novel cases are generated.
+#
+# It is recommended to check this file in to source control so that
+# everyone who runs the test benefits from these saved cases.
+cc 25407766c98e9d718173e44b5321f97049eea6d6d7737aad80a937d7230d67d9 # shrinks to limit = 1, ops = [Acquire, Acquire, CancelPending(Index(423873604949)), Acquire, ReleaseThenCancel(Index(2899867607303593255), Index(13233034632676646474))]
diff --git a/golem-worker-executor/src/identity.rs b/golem-worker-executor/src/identity.rs
new file mode 100644
index 0000000000..e2f95b0cae
--- /dev/null
+++ b/golem-worker-executor/src/identity.rs
@@ -0,0 +1,32 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Process/pod identity for this worker-executor instance.
+//!
+//! The identity is derived from the `POD_NAME` env var, falling back to
+//! `HOSTNAME`, then `"unknown"`; unset, non-Unicode and empty values are all
+//! skipped. It is resolved once, cached for the lifetime of the process, and
+//! used as the `executor_id` metric label and wherever the instance names itself.
+
+/// Returns the stable identity of this worker-executor instance.
+/// Resolved on the first call and cached for the lifetime of the process.
+pub fn executor_id() -> &'static str {
+    static EXECUTOR_ID: std::sync::OnceLock<String> = std::sync::OnceLock::new();
+    EXECUTOR_ID.get_or_init(|| {
+        let lookup = |key: &str| std::env::var(key).ok().filter(|id| !id.is_empty());
+        lookup("POD_NAME")
+            .or_else(|| lookup("HOSTNAME"))
+            .unwrap_or_else(|| "unknown".to_string())
+    })
+}
diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs
index a62f944cf0..b9ecc4f640 100644
--- a/golem-worker-executor/src/lib.rs
+++ b/golem-worker-executor/src/lib.rs
@@ -16,6 +16,7 @@ pub mod bootstrap;
 pub mod config;
 pub mod durable_host;
 pub mod grpc;
+pub mod identity;
 pub mod metrics;
 pub mod model;
 pub mod preview2;
@@ -161,6 +162,18 @@ impl Drop for RunDetails {
 #[async_trait]
 #[allow(clippy::too_many_arguments)]
 pub trait Bootstrap<Ctx: WorkerCtx> {
+    /// Builds the [`ActiveWorkers`] service, which hosts the measured-headroom
+    /// admission gate. By default it is constructed with `ActiveWorkers::new`
+    /// from the `memory` and `filesystem_storage` config sections. Bootstraps
+    /// may override this hook; the in-process test harness does so to inject a
+    /// probe with a pinned limit and usage, keeping the gate deterministic and
+    /// isolated from the shared test process's RSS.
+    fn create_active_workers(&self, golem_config: &GolemConfig) -> Arc<ActiveWorkers<Ctx>> {
+        let memory = &golem_config.memory;
+        let storage = &golem_config.filesystem_storage;
+        Arc::new(ActiveWorkers::<Ctx>::new(memory, storage))
+    }
+
     fn create_shard_manager_service(
         &self,
         shard_manager_client: Arc<dyn golem_service_base::clients::shard_manager::ShardManager>,
@@ -769,10 +782,7 @@ pub async fn create_worker_executor_impl<
         }
     };
 
-    let active_workers = Arc::new(ActiveWorkers::<Ctx>::new(
-        &golem_config.memory,
-        &golem_config.filesystem_storage,
-    ));
+    let active_workers = bootstrap.create_active_workers(&golem_config);
 
     let file_loader = Arc::new(FileLoader::new(
         initial_files_service.clone(),
@@ -1000,13 +1010,18 @@ pub async fn bootstrap_and_run_worker_executor<
 ) -> anyhow::Result<RunDetails> {
     debug!("Initializing worker executor");
 
-    let total_system_memory = golem_config.memory.total_system_memory();
-    let system_memory = golem_config.memory.system_memory();
-    let worker_memory = golem_config.memory.worker_memory();
+    let memory_snapshot = crate::services::active_workers::memory_probe::default_probe(
+        golem_config.memory.system_memory_override,
+    )
+    .snapshot();
+    let total_system_memory = memory_snapshot.limit_bytes;
+    let used_system_memory = memory_snapshot.current_bytes;
+    let worker_memory =
+        (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64;
     info!(
-        "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}",
+        "Measured memory limit: {}, Currently used: {}, Usable for workers: {}",
         ISizeFormatter::new(total_system_memory, BINARY),
-        ISizeFormatter::new(system_memory, BINARY),
+        ISizeFormatter::new(used_system_memory, BINARY),
         ISizeFormatter::new(worker_memory, BINARY)
     );
 
@@ -1047,11 +1062,18 @@ pub async fn bootstrap_and_run_worker_executor<
 
     let leak_detector = worker_executor_impl.leak_detector();
 
+    let runtime_metrics = crate::metrics::runtime::install_runtime_metrics(
+        runtime.clone(),
+        golem_config.runtime_metrics_sampling_interval,
+        join_set,
+    );
+
     let grpc_port = run_grpc_server(worker_executor_impl, lazy_worker_activator, join_set).await?;
 
-    let http_port = golem_service_base::observability::start_health_and_metrics_server(
+    let http_port = golem_service_base::observability::start_health_and_metrics_server_with_extra(
         golem_config.http_addr()?,
         prometheus_registry,
+        runtime_metrics,
         "Worker executor is running",
         join_set,
     )
diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs
index 768b3e6b98..980ae3842d 100644
--- a/golem-worker-executor/src/metrics.rs
+++ b/golem-worker-executor/src/metrics.rs
@@ -69,18 +69,26 @@ const SCHEDULER_LAG_BUCKETS: &[f64; 11] = &[
     0.001, 0.01, 0.1, 1.0, 5.0, 15.0, 30.0, 60.0, 120.0, 300.0, 600.0,
 ];
 
-const MEMORY_SIZE_BUCKETS: &[f64; 11] = &[
-    1024.0,
-    4096.0,
-    16384.0,
-    65536.0,
-    262144.0,
-    1048576.0,
-    4194304.0,
-    16777216.0,
-    67108864.0,
-    268435456.0,
-    1073741824.0,
+/// Buckets (bytes) for the `wasm` linear-memory histograms: total size after a
+/// `memory.grow` and per-worker grant. Fine-grained in the 1-32 MiB band where
+/// typical guests cluster, so p90/p99 are not pinned to a 4-16 MiB bucket edge.
+const MEMORY_SIZE_BUCKETS: &[f64; 16] = &[
+    65536.0,      // 64 KiB
+    262144.0,     // 256 KiB
+    1048576.0,    // 1 MiB
+    2097152.0,    // 2 MiB
+    4194304.0,    // 4 MiB
+    6291456.0,    // 6 MiB
+    8388608.0,    // 8 MiB
+    12582912.0,   // 12 MiB
+    16777216.0,   // 16 MiB
+    25165824.0,   // 24 MiB
+    33554432.0,   // 32 MiB
+    67108864.0,   // 64 MiB
+    134217728.0,  // 128 MiB
+    268435456.0,  // 256 MiB
+    536870912.0,  // 512 MiB
+    1073741824.0, // 1 GiB
 ];
 
 pub mod component {
@@ -105,6 +113,83 @@ pub mod component {
     }
 }
 
+pub mod runtime {
+    use std::sync::Arc;
+    use std::time::Duration;
+
+    use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
+    use tokio::runtime::Handle;
+    use tokio::task::JoinSet;
+    use tokio_metrics::RuntimeMetricsReporterBuilder;
+
+    /// How often the recorder's upkeep runs to keep its internal storage
+    /// bounded (e.g. pruning idle metrics once an idle timeout is configured).
+    const UPKEEP_INTERVAL: Duration = Duration::from_secs(30);
+
+    /// Installs a dedicated `metrics`-crate Prometheus recorder for tokio
+    /// runtime metrics, spawns the tokio-metrics reporter on `join_set`, and
+    /// returns a renderer that emits the collected metrics in Prometheus text
+    /// format.
+    ///
+    /// `sampling_interval` controls how often metrics are sampled from the
+    /// runtime into the recorder; Prometheus scrapes the rendered values
+    /// independently.
+    ///
+    /// The returned closure is appended to the `prometheus`-crate scrape output
+    /// on the shared `/metrics` endpoint, so all `tokio_*` series appear on the
+    /// same endpoint as the rest of the executor's metrics, carrying the same
+    /// `executor_id` label.
+    ///
+    /// Returns `None` if a global `metrics` recorder is already installed (which
+    /// should not happen in the executor), in which case runtime metrics are
+    /// simply not exported.
+    pub fn install_runtime_metrics(
+        runtime: Handle,
+        sampling_interval: Duration,
+        join_set: &mut JoinSet<anyhow::Result<()>>,
+    ) -> Option<Arc<dyn Fn() -> String + Send + Sync>> {
+        // Tag every exported series with this executor's identity.
+        let builder = PrometheusBuilder::new()
+            .add_global_label("executor_id", crate::identity::executor_id());
+
+        // Installing fails if a global recorder already exists: warn and run
+        // without runtime metrics rather than aborting startup.
+        let handle: PrometheusHandle = builder
+            .install_recorder()
+            .map_err(|err| {
+                tracing::warn!(
+                    "Failed to install tokio runtime metrics recorder, runtime metrics will not be exported: {err}"
+                );
+            })
+            .ok()?;
+
+        // Sampler: periodically copies tokio runtime metrics into the recorder.
+        let sampler = RuntimeMetricsReporterBuilder::default().with_interval(sampling_interval);
+        let sampling_task = async move {
+            sampler.describe_and_run().await;
+            Ok(())
+        };
+        join_set.spawn_on(sampling_task, &runtime);
+
+        // Upkeep on a fixed cadence keeps the recorder's storage bounded.
+        let housekeeping = handle.clone();
+        join_set.spawn_on(
+            async move {
+                let mut ticker = tokio::time::interval(UPKEEP_INTERVAL);
+                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+                loop {
+                    ticker.tick().await;
+                    housekeeping.run_upkeep();
+                }
+            },
+            &runtime,
+        );
+
+        let render = move || handle.render();
+        Some(Arc::new(render))
+    }
+}
+
 pub mod events {
     use lazy_static::lazy_static;
     use prometheus::*;
@@ -182,6 +267,61 @@ pub mod workers {
             crate::metrics::BLOB_SIZE_BUCKETS.to_vec()
         )
         .unwrap();
+        pub static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!(
+            "golem_worker_memory_pool_total_bytes",
+            "Usable memory ceiling (usable_ratio * measured limit) the admission gate admits against on this executor",
+            &["executor_id"]
+        )
+        .unwrap();
+        pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!(
+            "golem_worker_memory_pool_used_bytes",
+            "Total linear memory granted to live workers and reserved by the admission gate on this executor",
+            &["executor_id"]
+        )
+        .unwrap();
+        pub static ref WORKER_ADMISSION_RSS_BYTES: GaugeVec = register_gauge_vec!(
+            "golem_worker_admission_rss_bytes",
+            "Measured resident memory (probe snapshot) the admission gate last read on this executor",
+            &["executor_id"]
+        )
+        .unwrap();
+        pub static ref WORKER_MEMORY_GROW_REJECTED_TOTAL: CounterVec = register_counter_vec!(
+            "golem_worker_memory_grow_rejected_total",
+            "Invocations interrupted because a worker's linear-memory grow could not be admitted by the gate (out-of-memory trap, retried via reacquire)",
+            &["executor_id"]
+        )
+        .unwrap();
+    }
+
+    /// Counts one invocation interrupted because a linear-memory grow was
+    /// refused by the admission gate (the worker traps out-of-memory and is
+    /// restarted to reacquire memory).
+    pub fn record_worker_memory_grow_rejected() {
+        WORKER_MEMORY_GROW_REJECTED_TOTAL
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .inc();
+    }
+
+    /// Sets the gate's usable memory ceiling gauge.
+    pub fn record_worker_memory_ceiling(bytes: u64) {
+        WORKER_MEMORY_POOL_TOTAL_BYTES
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .set(bytes as f64);
+    }
+
+    /// Sets the gauge of total memory granted to live workers (the gate's
+    /// reservation).
+    pub fn record_worker_memory_granted(bytes: u64) {
+        WORKER_MEMORY_POOL_USED_BYTES
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .set(bytes as f64);
+    }
+
+    /// Sets the gauge of measured resident memory last read by the gate.
+    pub fn record_worker_admission_rss(bytes: u64) {
+        WORKER_ADMISSION_RSS_BYTES
+            .with_label_values(&[crate::metrics::storage::executor_id()])
+            .set(bytes as f64);
     }
 
     pub fn record_worker_call(api_name: &'static str) {
@@ -221,6 +361,9 @@ pub mod workers {
         WORKER_WAITING_FOR_MEMORY_COUNT
             .with_label_values(&[id])
             .set(0.0);
+        WORKER_MEMORY_GROW_REJECTED_TOTAL
+            .with_label_values(&[id])
+            .inc_by(0.0);
     }
 
     pub fn inc_worker_memory_resident() {
@@ -294,18 +437,6 @@ pub mod workers {
         WORKER_FILESYSTEM_SEMAPHORE_AVAILABLE.add(permits.into_f64());
     }
 
-    /// Records acquisition of `bytes` from the worker-memory pool.
-    /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`.
-    pub fn record_memory_permit_acquired(bytes: usize) {
-        crate::metrics::storage::record_worker_memory_pool_acquired(bytes as u64);
-    }
-
-    /// Records release of `bytes` back to the worker-memory pool.
-    /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`.
-    pub fn record_memory_permit_released(bytes: usize) {
-        crate::metrics::storage::record_worker_memory_pool_released(bytes as u64);
-    }
-
     pub fn record_worker_kv_cache_value_size(bytes: usize) {
         WORKER_KV_CACHE_VALUE_SIZE_BYTES
             .with_label_values(&[crate::metrics::storage::executor_id()])
@@ -504,7 +635,13 @@ pub mod wasm {
         .unwrap();
         static ref ALLOCATED_MEMORY_BYTES: Histogram = register_histogram!(
             "allocated_memory_bytes",
-            "Amount of memory allocated by a single memory.grow instruction",
+            "Worker's total linear memory size after a memory.grow, sampled at each grow",
+            crate::metrics::MEMORY_SIZE_BUCKETS.to_vec()
+        )
+        .unwrap();
+        static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!(
+            "worker_resident_linear_memory_bytes",
+            "Per-worker cumulative linear-memory grant (total_linear_memory_size = sum of memory.grow deltas) sampled when the worker is admitted. This is the linear memory the admission gate reserves for the worker; it is an upper bound on resident RSS, not measured resident memory, since grown pages are largely demand-paged. Compare to container_memory_working_set_bytes for the gap.",
             crate::metrics::MEMORY_SIZE_BUCKETS.to_vec()
         )
         .unwrap();
@@ -580,6 +717,10 @@ pub mod wasm {
     pub fn record_allocated_memory(amount: usize) {
         ALLOCATED_MEMORY_BYTES.observe(amount as f64);
     }
+
+    pub fn record_worker_resident_linear_memory(bytes: u64) {
+        WORKER_RESIDENT_LINEAR_MEMORY_BYTES.observe(bytes as f64); // one histogram sample
+    }
 }
 
 pub mod oplog {
@@ -717,16 +858,10 @@ pub mod storage {
     use lazy_static::lazy_static;
     use prometheus::*;
 
-    /// Returns the executor identity label: POD_NAME env var, falling back to HOSTNAME, then "unknown".
-    /// Resolved once on first call and cached for the lifetime of the process.
-    pub fn executor_id() -> &'static str {
-        static EXECUTOR_ID: std::sync::OnceLock<String> = std::sync::OnceLock::new();
-        EXECUTOR_ID.get_or_init(|| {
-            std::env::var("POD_NAME")
-                .or_else(|_| std::env::var("HOSTNAME"))
-                .unwrap_or_else(|_| "unknown".to_string())
-        })
-    }
+    /// Re-exported from [`crate::identity`], which owns the process identity.
+    /// Kept here so existing metric-recording call sites can keep using
+    /// `crate::metrics::storage::executor_id()`.
+    pub use crate::identity::executor_id;
 
     lazy_static! {
         pub static ref STORAGE_FILESYSTEM_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!(
@@ -741,18 +876,6 @@ pub mod storage {
             &["executor_id"]
         )
         .unwrap();
-        pub static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!(
-            "golem_worker_memory_pool_total_bytes",
-            "Configured worker-memory semaphore size in bytes for this executor",
-            &["executor_id"]
-        )
-        .unwrap();
-        pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!(
-            "golem_worker_memory_pool_used_bytes",
-            "Bytes currently acquired from the worker-memory semaphore on this executor",
-            &["executor_id"]
-        )
-        .unwrap();
     }
 
     pub fn record_filesystem_pool_total(bytes: u64) {
@@ -772,22 +895,4 @@ pub mod storage {
             .with_label_values(&[executor_id()])
             .sub(bytes as f64);
     }
-
-    pub fn record_worker_memory_pool_total(bytes: u64) {
-        WORKER_MEMORY_POOL_TOTAL_BYTES
-            .with_label_values(&[executor_id()])
-            .set(bytes as f64);
-    }
-
-    pub fn record_worker_memory_pool_acquired(bytes: u64) {
-        WORKER_MEMORY_POOL_USED_BYTES
-            .with_label_values(&[executor_id()])
-            .add(bytes as f64);
-    }
-
-    pub fn record_worker_memory_pool_released(bytes: u64) {
-        WORKER_MEMORY_POOL_USED_BYTES
-            .with_label_values(&[executor_id()])
-            .sub(bytes as f64);
-    }
 }
diff --git a/golem-worker-executor/src/server.rs b/golem-worker-executor/src/server.rs
index fbd1c7e60c..18b286adcb 100644
--- a/golem-worker-executor/src/server.rs
+++ b/golem-worker-executor/src/server.rs
@@ -21,6 +21,9 @@ use std::sync::Arc;
 use tokio::task::JoinSet;
 use tracing::info;
 
+#[global_allocator] // routes every heap allocation in this binary through mimalloc
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
 fn main() -> Result<(), anyhow::Error> {
     match make_config_loader().load_or_dump_config() {
         Some(mut config) => {
diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs
new file mode 100644
index 0000000000..ec57acd699
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs
@@ -0,0 +1,342 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Measured-headroom admission decision.
+//!
+//! Gates worker admission on the executor environment's memory headroom. It is
+//! the sole admission authority: there is no estimate-based semaphore behind it.
+//!
+//! The gate weighs two quantities against the usable ceiling:
+//!
+//! * Measured RSS from the [`MemoryProbe`] (cgroup `memory.current` on a
+//!   constrained pod) — what is resident right now.
+//! * The total linear memory *granted* to live workers — what they could fault
+//!   in at any moment.
+//!
+//! Both matter because they fail in opposite directions. Measured RSS lags
+//! admission: `memory.current` counts only touched pages, so a worker admitted
+//! moments ago is not yet resident and a burst admitted against the same low
+//! snapshot would collectively over-commit. The granted total leads residency: a
+//! worker can fault in any page of the virtual memory it was already granted at
+//! any later time, with no admission call to intercept it, so a gate that
+//! reserved only what is resident would let a node full of lightly-touched
+//! workers OOM by writing into memory they already hold. The gate therefore
+//! reserves the full granted total from admission until unload, and admits
+//! against the *larger* of measured RSS and that granted total — safe against
+//! both the burst race and later faulting of granted pages.
+//!
+//! The granted total is maintained by two integer updates: a worker's grant is
+//! added on admission, and removed when the [`MemoryGrant`] guard returned by
+//! admission is dropped. Tying the removal to the guard's drop — rather than to
+//! an explicit release call on some worker-lifecycle path — keeps the accounting
+//! symmetric no matter how a worker's start ends: whether it becomes resident and
+//! later stops, or its start is cancelled mid-flight (e.g. the worker is deleted
+//! while still waiting for permits), dropping the guard returns its reservation
+//! exactly once. The headroom check re-derives the reservation from the
+//! maintained total and the current probe reading, so it is O(1) and exact
+//! regardless of worker churn.
+//!
+//! When headroom is short the controller evicts already-resident idle-then-warm
+//! work; if it still cannot make room it rejects rather than over-committing.
+//!
+//! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`]
+//! trait so its decision logic can be exercised in isolation with synthetic
+//! probes and candidate sets.
+
+use super::memory_probe::MemoryProbe;
+use async_trait::async_trait;
+use std::sync::{Arc, Mutex};
+
+/// Why an eviction candidate is worth evicting. Variants are declared cheapest
+/// first, so the derived `Ord` gives `Idle < Warm`; `try_admit` evicts in that order.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) enum EvictionPriority {
+    /// Resident in memory, not executing, and with no durable pending work.
+    /// Cheapest to evict — the cost is at most a re-load on next use.
+    Idle,
+    /// Resident in memory, not executing, but with durable pending work.
+    /// Evicted only once every idle candidate has been exhausted.
+    Warm,
+}
+
+/// Supplier of already-resident memory that the controller may reclaim when
+/// headroom is short. Abstracting the live worker set behind a trait keeps the
+/// decision logic testable without `Worker`/wasmtime.
+#[async_trait]
+pub(crate) trait EvictionSource: Send + Sync {
+    /// Evicts at the `priority` tier, aiming to free at least `needed_bytes`.
+    ///
+    /// Returns the bytes actually reclaimed: fewer when the tier runs out, more
+    /// when a single victim exceeds the need. Implementations must not evict
+    /// from a higher (more expensive) tier than the one requested.
+    async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64;
+}
+
+/// Result of one admission attempt, as returned by `try_admit`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum AdmissionDecision {
+    /// Real headroom (possibly only after eviction) covers the request, so it
+    /// was admitted without risking the limit.
+    Admit,
+    /// Not enough headroom could be freed; the caller must back off rather than
+    /// over-commit.
+    Reject,
+}
+
+/// Tuning for the headroom-based admission decision.
+///
+/// The only knob is `usable_ratio`: the share of the measured limit that WASM
+/// admission may use. Whatever is left stays with the host (the executor
+/// process, allocator arenas, runtime buffers). It mirrors `worker_memory_ratio`
+/// but scales the measured limit instead of the configured total.
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct AdmissionPolicy {
+    /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission.
+    pub usable_ratio: f64,
+}
+
+/// Decides admission against measured headroom, evicting resident idle/warm
+/// work as needed. `granted` is its only mutable state.
+pub(crate) struct AdmissionController {
+    /// Source of limit/usage snapshots; read fresh on every headroom check.
+    probe: Box<dyn MemoryProbe>,
+    /// Fraction of the measured limit that admission may use.
+    policy: AdmissionPolicy,
+    /// Total bytes granted to live workers, held from admit until release.
+    granted: Mutex<u64>,
+}
+
+impl AdmissionController {
+    pub fn new(probe: Box<dyn MemoryProbe>, mut policy: AdmissionPolicy) -> Self {
+        policy.usable_ratio = policy.usable_ratio.clamp(0.0, 1.0); // ceiling <= limit
+        let ceiling = (probe.snapshot().limit_bytes as f64 * policy.usable_ratio) as u64;
+        crate::metrics::workers::record_worker_memory_ceiling(ceiling);
+        Self {
+            probe,
+            policy,
+            granted: Mutex::new(0),
+        }
+    }
+    /// Headroom left for a new admission: the usable ceiling minus whichever is
+    /// larger, measured RSS or the total granted to live workers. The
+    /// subtraction saturates, so being over the ceiling yields zero.
+    ///
+    /// The full granted total is reserved even before it is resident because a
+    /// worker may fault in any granted page later without passing through
+    /// admission. Measured RSS exceeds the granted total only transiently
+    /// (host/runtime overhead the grant does not cover), so the maximum guards
+    /// against both pages yet to be faulted in and usage the grant does not
+    /// capture.
+    fn admissible_headroom(&self) -> u64 {
+        let reserved = *self.granted.lock().unwrap();
+        self.headroom_with_granted(reserved)
+    }
+
+    /// Headroom for a `granted` total the caller has already read. Takes a fresh
+    /// probe snapshot and publishes the ceiling and RSS gauges. It never locks
+    /// `granted` itself, so [`Self::try_reserve_locked`] can call it under the lock.
+    fn headroom_with_granted(&self, granted: u64) -> u64 {
+        let reading = self.probe.snapshot();
+        let usable = (reading.limit_bytes as f64 * self.policy.usable_ratio) as u64;
+        crate::metrics::workers::record_worker_memory_ceiling(usable);
+        crate::metrics::workers::record_worker_admission_rss(reading.current_bytes);
+        let committed = reading.current_bytes.max(granted);
+        usable.saturating_sub(committed)
+    }
+
+    /// Check-and-reserve in one critical section. With the `granted` lock held,
+    /// computes headroom against the current total and, if it covers
+    /// `request_bytes`, adds the request to the total and publishes the new
+    /// value. Because the check and the update share one lock, two concurrent
+    /// admissions cannot both pass against the same headroom and overshoot the
+    /// ceiling. Returns `true` when the request was admitted.
+    fn try_reserve_locked(&self, request_bytes: u64) -> bool {
+        let mut total = self.granted.lock().unwrap();
+        if self.headroom_with_granted(*total) < request_bytes {
+            return false;
+        }
+        *total += request_bytes;
+        crate::metrics::workers::record_worker_memory_granted(*total);
+        true
+    }
+
+    /// Adds `request_bytes` to the granted total unconditionally and publishes
+    /// the new value. The bytes stay reserved until released, since the worker
+    /// may fault the granted pages in at any later time.
+    fn reserve(&self, request_bytes: u64) {
+        let mut total = self.granted.lock().unwrap();
+        *total += request_bytes;
+        crate::metrics::workers::record_worker_memory_granted(*total);
+    }
+
+    /// Accounts for memory that is a committed consequence of an already
+    /// admitted worker, not a fresh admission — currently a component's compiled
+    /// module, loaded into RAM when the component's first worker becomes
+    /// resident and shared by all of its workers. There is no eviction and no
+    /// rejection here (the worker is already in); the bytes are recorded so that
+    /// later admissions see them. Undo with [`Self::release`].
+    pub(crate) fn reserve_committed(&self, bytes: u64) {
+        self.reserve(bytes);
+    }
+
+    /// Returns `reserved_bytes` to the gate once a worker has unloaded and its
+    /// pages have left memory. Saturates at zero. Skipping the release would
+    /// permanently shrink admissible headroom as workers come and go.
+    pub(crate) fn release(&self, reserved_bytes: u64) {
+        let mut total = self.granted.lock().unwrap();
+        let remaining = total.saturating_sub(reserved_bytes);
+        *total = remaining;
+        crate::metrics::workers::record_worker_memory_granted(remaining);
+    }
+
+    /// Test-only: pre-registers grants for workers that were live before the
+    /// controller existed. Production registers every grant through admission.
+    #[cfg(test)]
+    pub fn seed_granted(&self, bytes: u64) {
+        let mut total = self.granted.lock().unwrap();
+        *total += bytes;
+    }
+
+    /// Decides whether `request_bytes` fits, reclaiming memory from `source`
+    /// when the current headroom falls short.
+    ///
+    /// Nothing is evicted if headroom already suffices. Otherwise eviction runs
+    /// idle-first, then warm, and stops once the shortfall is covered. Headroom
+    /// is then re-measured against ground truth and the request is admitted only
+    /// if it now fits; otherwise it is rejected. An admitted request is added to
+    /// the granted total as part of the same atomic check.
+    async fn try_admit(
+        &self,
+        request_bytes: u64,
+        source: &dyn EvictionSource,
+    ) -> AdmissionDecision {
+        // Fast path: reserve atomically when real headroom already suffices.
+        if self.try_reserve_locked(request_bytes) {
+            return AdmissionDecision::Admit;
+        }
+
+        // Work through the tiers cheapest-first until the shortfall is covered.
+        let tiers = [EvictionPriority::Idle, EvictionPriority::Warm];
+        let mut still_needed = request_bytes.saturating_sub(self.admissible_headroom());
+
+        for tier in tiers {
+            if still_needed == 0 {
+                break;
+            }
+            let reclaimed = source.evict_at_most(tier, still_needed).await;
+            still_needed = still_needed.saturating_sub(reclaimed);
+        }
+
+        // The reclaimed tally is not trusted: the probe is the authority, and
+        // usage may have moved either way while eviction ran. Re-run the atomic
+        // check-and-reserve so a concurrent admission cannot slip in between.
+        let admitted = self.try_reserve_locked(request_bytes);
+        if admitted {
+            AdmissionDecision::Admit
+        } else {
+            AdmissionDecision::Reject
+        }
+    }
+
+    /// Test-only view of the current admissible headroom, for asserting on the
+    /// gate's accounting. Production code only observes it through admission.
+    #[cfg(test)]
+    pub(crate) fn headroom_bytes(&self) -> u64 {
+        self.admissible_headroom()
+    }
+
+    /// Admits `request_bytes`, evicting resident idle-then-warm work if needed.
+    /// Returns a [`MemoryGrant`] guard that owns the reservation and releases it
+    /// on drop, or `None` when the request cannot be admitted.
+    ///
+    /// A starting worker carries its grant across several `.await` points before
+    /// it becomes resident (per-account concurrency, component charge,
+    /// filesystem storage). If that work is cancelled — for example the worker
+    /// is deleted while still waiting — dropping the guard returns the
+    /// reservation, so a cancelled start cannot leak headroom.
+    pub(crate) async fn admit(
+        self: &Arc<Self>,
+        request_bytes: u64,
+        source: &dyn EvictionSource,
+    ) -> Option<MemoryGrant> {
+        if self.try_admit(request_bytes, source).await == AdmissionDecision::Reject {
+            return None;
+        }
+        Some(MemoryGrant {
+            controller: Some(Arc::clone(self)),
+            bytes: request_bytes,
+        })
+    }
+}
+
+/// Guard over a reservation made with the [`AdmissionController`]. Dropping it
+/// returns the bytes to the gate, so a reservation is released exactly once
+/// whether the worker became resident or its start was cancelled. With
+/// admission disabled the grant is inert (no controller, nothing to release),
+/// letting callers hold one without branching on whether admission is active.
+pub(crate) struct MemoryGrant {
+    /// Gate to release into on drop; `None` marks an inert grant.
+    controller: Option<Arc<AdmissionController>>,
+    /// Bytes this grant currently owns.
+    bytes: u64,
+}
+
+impl MemoryGrant {
+    /// Builds the placeholder grant used when measured admission is disabled: it
+    /// owns no reservation, so dropping it releases nothing.
+    pub(crate) fn inert() -> Self {
+        Self {
+            controller: None,
+            bytes: 0,
+        }
+    }
+
+    /// Absorbs `other` into this grant so that a worker which grows its memory
+    /// keeps a single grant for its whole reservation. `other` is consumed and
+    /// emptied, so its drop releases nothing; the combined bytes are released
+    /// exactly once, when this grant drops.
+    pub(crate) fn merge(&mut self, mut other: MemoryGrant) {
+        // Empty `other` up front so that its drop at the end of this call cannot
+        // release bytes that now belong to `self`.
+        let absorbed_controller = other.controller.take();
+        let absorbed_bytes = std::mem::take(&mut other.bytes);
+        // Bytes from an inert grant were never reserved, so they are not counted.
+        if let Some(controller) = absorbed_controller {
+            self.bytes += absorbed_bytes;
+            // A grant that started inert adopts the controller so it still releases.
+            if self.controller.is_none() {
+                self.controller = Some(controller);
+            }
+        }
+    }
+}
+
+impl std::fmt::Debug for MemoryGrant {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Only `bytes` is printed; the controller handle is left out.
+        let mut out = f.debug_struct("MemoryGrant");
+        out.field("bytes", &self.bytes).finish()
+    }
+}
+
+impl Drop for MemoryGrant {
+    fn drop(&mut self) {
+        if let Some(gate) = self.controller.as_ref() {
+            gate.release(self.bytes);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs
new file mode 100644
index 0000000000..6f263930b3
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs
@@ -0,0 +1,1048 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Property-based and example tests for the measured-headroom admission valve.
+//!
+//! These tests model an executor environment as a shared cell holding a hard
+//! `limit`, the current resident `usage`, and the set of resident evictable
+//! work (each item carrying a size and an eviction priority). A [`FakeProbe`]
+//! reports `usage`/`limit` from the cell; a [`FakeEvictionSource`] reclaims
+//! idle-then-warm items and decrements `usage`. Admitting a request adds its
+//! size to `usage` as a new resident, non-evictable item (it is actively being
+//! created).
+//!
+//! The model lets `proptest` drive thousands of random admit sequences — with
+//! random request sizes, pre-resident work, and limits — and assert the
+//! invariants that *define* a correct safety valve:
+//!
+//! 1. Safety: usage never exceeds the limit (the environment never OOMs).
+//! 2. No spurious eviction: when headroom is ample, nothing is evicted.
+//! 3. Eviction ordering: idle work is reclaimed before warm work.
+//! 4. Clean rejection: when the request genuinely cannot fit, the decision is
+//!    `Reject` and no over-commit happens.
+
+use super::*;
+use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot};
+use proptest::prelude::*;
+use std::sync::{Arc, Mutex};
+use test_r::test;
+
+test_r::enable!();
+
+/// A single resident, evictable item in the model: its size and eviction tier.
+#[derive(Debug, Clone, Copy)]
+struct Resident {
+    size: u64,
+    priority: EvictionPriority,
+}
+
+/// An admitted request whose pages have only partly faulted into RSS.
+///
+/// Captures the lag between admission and residency: the worker was admitted
+/// for `reserved` bytes, of which only `resident` have touched memory so far.
+/// Measured RSS (what the probe reports) includes just `resident`; the other
+/// `reserved - resident` bytes are still in flight and show up in RSS later.
+/// That lag is what allows concurrent admissions judged against one RSS
+/// snapshot to over-commit collectively.
+#[derive(Debug, Clone, Copy)]
+struct InFlight {
+    reserved: u64,
+    resident: u64,
+}
+
+/// Shared model of the executor environment's memory.
+#[derive(Debug, Default)]
+struct EnvState {
+    /// Hard limit reported by the probe as `limit_bytes`.
+    limit: u64,
+    /// Resident bytes of admitted, currently-active requests that are not yet
+    /// evictable (they are mid-admission).
+    pinned_usage: u64,
+    /// Resident, evictable work — what the controller may reclaim.
+    residents: Vec<Resident>,
+    /// Admitted requests still faulting in: `resident` counts toward measured
+    /// RSS now, `reserved` is what RSS reaches once they are fully resident.
+    in_flight: Vec<InFlight>,
+    /// Number of evictions performed, for the no-spurious-eviction property.
+    evictions: usize,
+    /// Tiers evicted, in order, for the ordering property.
+    eviction_order: Vec<EvictionPriority>,
+}
+
+impl EnvState {
+    /// Measured RSS as the probe reports it: pinned bytes, evictable residents,
+    /// and only the faulted-in part of each in-flight request. It therefore
+    /// lags behind what has been admitted.
+    fn usage(&self) -> u64 {
+        let evictable: u64 = self.residents.iter().map(|r| r.size).sum();
+        let faulted_in: u64 = self.in_flight.iter().map(|f| f.resident).sum();
+        self.pinned_usage + evictable + faulted_in
+    }
+
+    /// Bytes admitted work will occupy once every in-flight request has fully
+    /// faulted in. Safety is stated against this value: reserved bytes always
+    /// become resident in this model, so if it can exceed the limit the
+    /// environment OOMs once the lag resolves.
+    fn eventual_usage(&self) -> u64 {
+        let evictable: u64 = self.residents.iter().map(|r| r.size).sum();
+        let reserved: u64 = self.in_flight.iter().map(|f| f.reserved).sum();
+        self.pinned_usage + evictable + reserved
+    }
+
+    /// Advances residency by one tick: every in-flight request faults in up to
+    /// `step` more of its reserved bytes, pulling measured RSS toward its
+    /// eventual size. Requests that are now fully resident move to `pinned_usage`.
+    fn tick_residency(&mut self, step: u64) {
+        for entry in self.in_flight.iter_mut() {
+            let untouched = entry.reserved - entry.resident;
+            entry.resident += untouched.min(step);
+        }
+        let (finished, still_faulting): (Vec<_>, Vec<_>) = self
+            .in_flight
+            .drain(..)
+            .partition(|entry| entry.resident >= entry.reserved);
+        self.pinned_usage += finished.iter().map(|entry| entry.reserved).sum::<u64>();
+        self.in_flight = still_faulting;
+    }
+
+    /// Faults in up to `step` bytes of granted-but-untouched memory for the
+    /// in-flight request at `index` only; every other request is left alone. A
+    /// worker may touch memory it was already granted at any later time with no
+    /// admission call involved, so this raises measured RSS for one worker in
+    /// isolation. An out-of-range `index` is ignored.
+    fn fault_in_one(&mut self, index: usize, step: u64) {
+        if let Some(entry) = self.in_flight.get_mut(index) {
+            let untouched = entry.reserved - entry.resident;
+            entry.resident += untouched.min(step);
+        }
+    }
+
+    /// Removes the in-flight worker at `index`: it finishes and unloads, freeing
+    /// its resident pages and its remaining grant, so measured RSS drops by its
+    /// resident portion. Returns the bytes it was admitted for so the caller can
+    /// release the gate's reservation, or `None` when `index` is out of range.
+    /// Reservations the surviving workers hold for their own untouched grants
+    /// must not be credited by this drop.
+    fn exit_one(&mut self, index: usize) -> Option<u64> {
+        if index >= self.in_flight.len() {
+            return None;
+        }
+        Some(self.in_flight.remove(index).reserved)
+    }
+}
+
+#[derive(Debug, Clone)]
+struct FakeProbe {
+    state: Arc<Mutex<EnvState>>, // shared model this probe reports from
+}
+
+impl MemoryProbe for FakeProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        let state = self.state.lock().unwrap();
+        MemorySnapshot {
+            limit_bytes: state.limit,
+            current_bytes: state.usage(), // measured RSS, which lags admissions
+        }
+    }
+}
+
+struct FakeEvictionSource {
+    state: Arc<Mutex<EnvState>>, // shared model whose residents get evicted
+    /// The gate, so that evicting a resident can also release its grant — in
+    /// production, eviction unloads the worker, which releases its grant.
+    controller: Arc<AdmissionController>,
+}
+
+#[async_trait::async_trait]
+impl EvictionSource for FakeEvictionSource {
+    async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 {
+        let mut state = self.state.lock().unwrap();
+        let mut freed = 0u64;
+        // Evict at the requested tier only, oldest-first (vec order), until enough is freed.
+        let mut i = 0;
+        while freed < needed_bytes && i < state.residents.len() {
+            if state.residents[i].priority == priority {
+                freed += state.residents.remove(i).size;
+                state.evictions += 1;
+                state.eviction_order.push(priority);
+            } else {
+                i += 1;
+            }
+        }
+        // Unlock the model before `release`: the gate probes the model under its own lock.
+        drop(state);
+        self.controller.release(freed);
+        freed
+    }
+}
+
+fn controller(state: Arc<Mutex<EnvState>>) -> Arc<AdmissionController> {
+    controller_with_ratio(state, 1.0) // ratio 1.0: the whole limit is usable
+}
+
+/// Builds a gate over the model with the given usable ratio, pre-seeded with
+/// the grants of everything already resident in the model.
+fn controller_with_ratio(
+    state: Arc<Mutex<EnvState>>,
+    usable_ratio: f64,
+) -> Arc<AdmissionController> {
+    let probe = FakeProbe {
+        state: Arc::clone(&state),
+    };
+    let gate = AdmissionController::new(Box::new(probe), AdmissionPolicy { usable_ratio });
+    // Workers resident before the gate existed registered their grants at
+    // their own admission; seed the gate with the same total.
+    let already_granted = {
+        let model = state.lock().unwrap();
+        model.pinned_usage + model.residents.iter().map(|r| r.size).sum::<u64>()
+    };
+    gate.seed_granted(already_granted);
+    Arc::new(gate)
+}
+
+fn eviction_source(
+    state: Arc<Mutex<EnvState>>,
+    controller: Arc<AdmissionController>,
+) -> FakeEvictionSource {
+    FakeEvictionSource { state, controller } // pairs the model with its gate
+}
+
+/// Runs one admission attempt; on admit the request becomes pinned usage at once.
+async fn apply_admit(
+    controller: &AdmissionController,
+    source: &FakeEvictionSource,
+    state: &Arc<Mutex<EnvState>>,
+    request: u64,
+) -> AdmissionDecision {
+    let outcome = controller.try_admit(request, source).await;
+    if matches!(outcome, AdmissionDecision::Admit) {
+        state.lock().unwrap().pinned_usage += request;
+    }
+    outcome
+}
+
+/// Runs one admission attempt in which admitted bytes do NOT become resident
+/// right away. An admitted request is recorded as in-flight with zero resident
+/// bytes, leaving measured RSS unchanged until a later residency tick faults
+/// its pages in. This is the admission-to-RSS lag during which concurrent
+/// admissions judged on one snapshot can collectively over-commit.
+async fn apply_staggered_admit(
+    controller: &AdmissionController,
+    source: &FakeEvictionSource,
+    state: &Arc<Mutex<EnvState>>,
+    request: u64,
+) -> AdmissionDecision {
+    let outcome = controller.try_admit(request, source).await;
+    if matches!(outcome, AdmissionDecision::Admit) {
+        let pending = InFlight {
+            reserved: request,
+            resident: 0,
+        };
+        state.lock().unwrap().in_flight.push(pending);
+    }
+    outcome
+}
+
+/// A probe with a fixed limit that always reports zero current usage, leaving
+/// the admission decision to depend only on the granted total versus the
+/// ceiling. The concurrency test relies on it to show that racing admissions
+/// cannot over-commit the granted counter.
+#[derive(Debug)]
+struct ZeroUsageProbe {
+    limit: u64,
+}
+
+impl MemoryProbe for ZeroUsageProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.limit,
+            current_bytes: 0,
+        }
+    }
+}
+
+/// An eviction source that never frees anything, so a rejected request stays rejected.
+struct NoEvictionSource;
+
+#[async_trait::async_trait]
+impl EvictionSource for NoEvictionSource {
+    async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 {
+        0
+    }
+}
+
+/// Racing admissions must not collectively grant more than the ceiling.
+///
+/// `ATTEMPTS` equally sized requests are spawned onto a multi-threaded runtime
+/// against a controller whose ceiling fits only `CAPACITY / REQUEST` of them,
+/// with an eviction source that frees nothing. Exactly that many must come back
+/// `Admit` and all others `Reject`. The count can only be exact if checking for
+/// room and reserving it form one atomic step per admission: two admits that
+/// both read the same headroom before either reserves would both pass, and the
+/// granted total would overshoot.
+#[test]
+fn concurrent_admissions_never_overcommit_the_ceiling() {
+    const REQUEST: u64 = 10;
+    const CAPACITY: u64 = 50; // room for exactly 5 requests
+    const ATTEMPTS: usize = 200; // far more racers than can fit
+
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(8)
+        .build()
+        .unwrap();
+
+    runtime.block_on(async {
+        let probe = Box::new(ZeroUsageProbe { limit: CAPACITY });
+        let policy = AdmissionPolicy { usable_ratio: 1.0 };
+        let gate = Arc::new(AdmissionController::new(probe, policy));
+
+        // Spawn every attempt up front; `collect` drives the spawning eagerly.
+        let racers: Vec<_> = (0..ATTEMPTS)
+            .map(|_| {
+                let gate = gate.clone();
+                tokio::spawn(async move { gate.try_admit(REQUEST, &NoEvictionSource).await })
+            })
+            .collect();
+
+        let mut admitted = 0usize;
+        for racer in racers {
+            let decision = racer.await.unwrap();
+            if decision == AdmissionDecision::Admit {
+                admitted += 1;
+            }
+        }
+
+        let expected = (CAPACITY / REQUEST) as usize;
+        assert_eq!(
+            admitted, expected,
+            "expected exactly {expected} admissions to fit, got {admitted}"
+        );
+        // Restates the safety half of the equality above on its own terms: the
+        // bytes handed out must fit under the ceiling.
+        let granted = admitted as u64 * REQUEST;
+        assert!(
+            granted <= CAPACITY,
+            "granted {} exceeded ceiling {CAPACITY}",
+            granted
+        );
+    });
+}
+
+// ── Single-case unit tests ───────────────────────────────────────────────────
+
+#[test]
+async fn admits_when_headroom_is_ample_without_evicting() {
+    let idle = Resident {
+        size: 100,
+        priority: EvictionPriority::Idle,
+    };
+    let env = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 0,
+        residents: vec![idle],
+        ..Default::default()
+    }));
+    let gate = controller(env.clone());
+    let source = eviction_source(env.clone(), gate.clone());
+
+    assert_eq!(apply_admit(&gate, &source, &env, 200).await, AdmissionDecision::Admit);
+    // 100 resident + 200 requested fits in 1000, so nothing may be evicted.
+    assert_eq!(env.lock().unwrap().evictions, 0);
+}
+
+/// When eviction is needed and an idle resident alone covers the shortfall,
+/// only that idle resident is evicted.
+#[test]
+async fn evicts_idle_before_warm() {
+    let resident = |priority: EvictionPriority| Resident { size: 400, priority };
+    let env = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 0,
+        residents: vec![
+            resident(EvictionPriority::Warm),
+            resident(EvictionPriority::Idle),
+        ],
+        ..Default::default()
+    }));
+    let gate = controller(env.clone());
+    let source = eviction_source(env.clone(), gate.clone());
+
+    // Residents occupy 800 of 1000, leaving 200 of headroom. Asking for 300
+    // is 100 short; evicting the single idle resident (400) more than covers
+    // that, so the warm resident has no reason to go.
+    let decision = apply_admit(&gate, &source, &env, 300).await;
+    assert_eq!(decision, AdmissionDecision::Admit);
+
+    let settled = env.lock().unwrap();
+    // Exactly one eviction, and it was the idle one.
+    assert_eq!(settled.eviction_order, vec![EvictionPriority::Idle]);
+    // Modeled usage must still be within the limit afterwards.
+    assert!(settled.usage() <= settled.limit);
+}
+
+#[test]
+async fn rejects_when_nothing_can_be_freed() {
+    // 950 of 1000 bytes are pinned (mid-admission) and there are no residents
+    // at all, so eviction has nothing to offer a 200-byte request.
+    let env = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 950,
+        residents: Vec::new(),
+        ..Default::default()
+    }));
+    let gate = controller(env.clone());
+    let source = eviction_source(env.clone(), gate.clone());
+
+    assert_eq!(apply_admit(&gate, &source, &env, 200).await, AdmissionDecision::Reject);
+    // A rejected request must leave modeled usage exactly where it was.
+    assert_eq!(env.lock().unwrap().usage(), 950);
+}
+
+// ── Property tests ───────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+enum Op {
+    Admit(u64), // bytes requested by one admission attempt
+}
+
+/// One step of a staggered-start schedule. In contrast to [`Op`], an admission
+/// here does not make its bytes resident; residency only advances on `Tick`, so
+/// a schedule can interleave admissions and page-faulting in any order.
+#[derive(Debug, Clone)]
+enum StaggeredOp {
+    /// Attempt to admit a worker that reserves this many bytes.
+    Admit(u64),
+    /// Fault in up to this many further bytes of every in-flight worker.
+    Tick(u64),
+}
+
+fn arb_resident_priority() -> impl Strategy<Value = EvictionPriority> {
+    prop_oneof![Just(EvictionPriority::Idle), Just(EvictionPriority::Warm)] // Idle or Warm, unweighted
+}
+
+fn arb_ops() -> impl Strategy<Value = Vec<Op>> {
+    prop::collection::vec((1u64..800).prop_map(Op::Admit), 0..40) // 0..=39 admits of 1..=799 bytes
+}
+
+/// Generates a `(limit, residents)` start state whose residents are guaranteed
+/// to fit under the limit, because every resident's size is carved out of a
+/// shrinking budget that itself never exceeds 80% of the limit. An over-limit
+/// resident set is not a meaningful start state: in reality it would already
+/// have been OOM-killed.
+fn arb_fitting_state(
+    limit_range: std::ops::Range<u64>,
+    max_residents: usize,
+) -> impl Strategy<Value = (u64, Vec<Resident>)> {
+    limit_range.prop_flat_map(move |limit| {
+        // Up to four fifths of the limit may be spent on residents, so most
+        // start states keep some free headroom.
+        let total_budget = 0u64..=(limit * 4 / 5);
+        // Raw weights are upper bounds on size; the mapping below clamps them.
+        let raw_residents =
+            prop::collection::vec((1u64..=1000, arb_resident_priority()), 0..max_residents);
+        (Just(limit), total_budget, raw_residents).prop_map(|(limit, budget, raw)| {
+            let mut remaining = budget;
+            let mut residents = Vec::new();
+            for (weight, priority) in raw {
+                if remaining == 0 {
+                    break;
+                }
+                // At most a third (rounded up) of what is left, at least one
+                // byte, and never more than what is left.
+                let size = weight.min(remaining.div_ceil(3)).max(1).min(remaining);
+                remaining -= size;
+                residents.push(Resident { size, priority });
+            }
+            (limit, residents)
+        })
+    })
+}
+
+proptest! {
+    /// Safety invariant: for any generated start state and any sequence of
+    /// admits of random sizes, modeled usage must never exceed the limit after
+    /// any step. This is the property that rules out OOM.
+    #[test]
+    fn usage_never_exceeds_limit(
+        (limit, residents) in arb_fitting_state(500..5000, 20),
+        ops in arb_ops(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone());
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in ops {
+                match op {
+                    Op::Admit(req) => {
+                        apply_admit(&ctrl, &source, &state, req).await;
+                        let s = state.lock().unwrap();
+                        prop_assert!(
+                            s.usage() <= s.limit,
+                            "usage {} exceeded limit {}", s.usage(), s.limit
+                        );
+                    }
+                }
+            }
+            Ok(())
+        })?; // hand a failed assertion to proptest as an error, not a panic
+    }
+
+    /// No spurious eviction: if every admit in the sequence fits within the
+    /// admissible headroom at the moment it is issued, nothing is ever evicted.
+    /// The precondition is arranged with a huge limit and small requests.
+    #[test]
+    fn no_eviction_when_headroom_ample(
+        residents in prop::collection::vec(
+            (1u64..500, arb_resident_priority())
+                .prop_map(|(size, priority)| Resident { size, priority }),
+            0..20,
+        ),
+        ops in prop::collection::vec((1u64..50).prop_map(Op::Admit), 0..30),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit: 1_000_000,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone());
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in ops {
+                match op {
+                    Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; }
+                }
+            }
+            prop_assert_eq!(state.lock().unwrap().evictions, 0);
+            Ok(())
+        })?; // hand a failed assertion to proptest as an error, not a panic
+    }
+
+    /// Eviction ordering: a warm item must not be evicted while an idle one is
+    /// still available. This is checked through a weaker, order-level invariant
+    /// over the eviction order recorded across the whole run of admits (not per
+    /// `try_admit` call): once a warm eviction appears, no idle eviction may
+    /// follow it — i.e. idle is always drained first.
+    #[test]
+    fn idle_evicted_before_warm(
+        (limit, residents) in arb_fitting_state(500..3000, 25),
+        ops in prop::collection::vec((1u64..1500).prop_map(Op::Admit), 1..20),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone());
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in ops {
+                match op {
+                    Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; }
+                }
+            }
+
+            // Walk the full recorded order: after the first warm eviction, any
+            // idle eviction is a violation (idle must be exhausted first).
+            let order = state.lock().unwrap().eviction_order.clone();
+            let mut seen_warm = false;
+            for p in order {
+                match p {
+                    EvictionPriority::Warm => seen_warm = true,
+                    EvictionPriority::Idle => prop_assert!(
+                        !seen_warm,
+                        "idle eviction followed a warm eviction"
+                    ),
+                }
+            }
+            Ok(())
+        })?; // hand a failed assertion to proptest as an error, not a panic
+    }
+}
+
+// ── Staggered-start safety ───────────────────────────────────────────────────
+
+/// Random schedule mixing admissions with residency ticks.
+///
+/// An admitted request only reserves bytes; a later `Tick` is what makes them
+/// resident. Interleaving the two exercises the lag between admission and
+/// measured RSS, the window in which admissions can collectively over-commit.
+/// `Admit` is weighted 3:1 over `Tick` so bursts of admissions tend to land
+/// between ticks (the dangerous case).
+fn arb_staggered_schedule() -> impl Strategy<Value = Vec<StaggeredOp>> {
+    // Admission sizes and tick steps share the same 1..800 byte range.
+    let admit = (1u64..800).prop_map(StaggeredOp::Admit);
+    let tick = (1u64..800).prop_map(StaggeredOp::Tick);
+    // Up to 59 steps per schedule.
+    prop::collection::vec(prop_oneof![3 => admit, 1 => tick], 0..60)
+}
+
+proptest! {
+    /// Safety invariant under staggered starts: for any interleaving of
+    /// admissions and residency ticks, once every admitted worker has fully
+    /// faulted its pages in, resident usage must not exceed the limit.
+    ///
+    /// Reserved bytes always eventually become resident, so the final check is
+    /// made after a full-residency tick: if that state can exceed the limit,
+    /// the environment OOMs once the admission lag resolves. This is the
+    /// general form of the staggered-burst case — admissions that read the
+    /// same low RSS snapshot before each other's pages are counted.
+    #[test]
+    fn staggered_starts_never_exceed_limit_once_resident(
+        (limit, residents) in arb_fitting_state(500..5000, 20),
+        schedule in arb_staggered_schedule(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                pinned_usage: 0,
+                residents,
+                ..Default::default()
+            }));
+            let ctrl = controller(state.clone());
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in schedule {
+                match op {
+                    StaggeredOp::Admit(req) => {
+                        apply_staggered_admit(&ctrl, &source, &state, req).await;
+                    }
+                    StaggeredOp::Tick(step) => {
+                        state.lock().unwrap().tick_residency(step);
+                    }
+                }
+                // Even mid-flight, measured RSS must never exceed the limit.
+                let s = state.lock().unwrap();
+                prop_assert!(
+                    s.usage() <= s.limit,
+                    "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit
+                );
+            }
+
+            // Fault in everything still in flight, then check the eventual
+            // resident footprint fits.
+            state.lock().unwrap().tick_residency(u64::MAX);
+            let s = state.lock().unwrap();
+            prop_assert!(
+                s.eventual_usage() <= s.limit,
+                "eventual resident usage {} exceeded limit {} once fully resident",
+                s.eventual_usage(), s.limit
+            );
+            Ok(())
+        })?; // hand a failed assertion to proptest as an error, not a panic
+    }
+}
+
+// ── Granted virtual memory ───────────────────────────────────────────────────
+
+/// One step of a schedule that stresses granted-but-untouched memory.
+#[derive(Debug, Clone)]
+enum GrantOp {
+    /// Attempt to admit a worker that is granted this many bytes of memory.
+    Grant(u64),
+    /// Fault in up to this many bytes of the in-flight worker at this index
+    /// only, leaving every other in-flight worker as it is.
+    FaultIn(usize, u64),
+    /// The in-flight worker at this index finishes and unloads, giving up both
+    /// its resident pages and whatever remains of its grant.
+    Exit(usize),
+}
+
+/// Random schedule of grants, isolated fault-ins and exits (at most 79 steps).
+fn arb_grant_schedule() -> impl Strategy<Value = Vec<GrantOp>> {
+    // Worker indices come from 0..20, whatever is actually in flight.
+    let grant = (1u64..800).prop_map(GrantOp::Grant);
+    let fault_in = (0usize..20, 1u64..800).prop_map(|(i, step)| GrantOp::FaultIn(i, step));
+    let exit = (0usize..20).prop_map(GrantOp::Exit);
+    // Grants and fault-ins are each three times as likely as an exit.
+    let step = prop_oneof![3 => grant, 3 => fault_in, 1 => exit];
+    prop::collection::vec(step, 0..80)
+}
+
+proptest! {
+    /// A worker may fault in the virtual memory it was already granted at any
+    /// later time, with no admission call in the loop. Once every granted byte
+    /// of every admitted worker becomes resident, that resident footprint must
+    /// not exceed the limit.
+    ///
+    /// Granted bytes can always become resident — nothing in the runtime forces
+    /// a worker to leave granted pages untouched — so the safety check is made
+    /// against the sum of granted sizes after faulting everything in. If that
+    /// can exceed the limit, a node of workers touching their already-granted
+    /// pages will OOM with no grow and no admission to intercept it.
+    #[test]
+    fn granted_memory_never_exceeds_limit_once_faulted_in(
+        limit in 800u64..6000,
+        schedule in arb_grant_schedule(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() }));
+            // NOTE(review): `controller` is presumed to use usable_ratio 1.0,
+            // isolating the granted-memory hole from the host carve-out — confirm.
+            let ctrl = controller(state.clone());
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in schedule {
+                match op {
+                    GrantOp::Grant(bytes) => {
+                        apply_staggered_admit(&ctrl, &source, &state, bytes).await;
+                    }
+                    GrantOp::FaultIn(index, step) => {
+                        state.lock().unwrap().fault_in_one(index, step);
+                    }
+                    GrantOp::Exit(index) => {
+                        let reserved = state.lock().unwrap().exit_one(index);
+                        if let Some(reserved) = reserved {
+                            ctrl.release(reserved);
+                        }
+                    }
+                }
+                let s = state.lock().unwrap();
+                prop_assert!(
+                    s.usage() <= s.limit,
+                    "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit
+                );
+            }
+
+            // Every granted byte may yet fault in. Once it all does, it must fit.
+            state.lock().unwrap().tick_residency(u64::MAX);
+            let s = state.lock().unwrap();
+            prop_assert!(
+                s.eventual_usage() <= s.limit,
+                "granted memory {} exceeded limit {} once fully faulted in",
+                s.eventual_usage(), s.limit
+            );
+            Ok(())
+        })?; // hand a failed assertion to proptest as an error, not a panic
+    }
+
+    /// Liveness: once every admitted worker has unloaded and its pages have left
+    /// memory, the gate's admissible headroom must return to the full ceiling.
+    ///
+    /// Reservations for workers that exit while still holding untouched granted
+    /// memory must be released on unload. If they were not, each such exit would
+    /// permanently shrink headroom, and a node churning workers would slowly
+    /// refuse all admissions despite being empty.
+    #[test]
+    fn headroom_recovers_after_all_workers_exit(
+        limit in 800u64..6000,
+        schedule in arb_grant_schedule(),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let usable_ratio = 0.8;
+            let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() }));
+            let ctrl = controller_with_ratio(state.clone(), usable_ratio);
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            for op in schedule {
+                match op {
+                    GrantOp::Grant(bytes) => {
+                        apply_staggered_admit(&ctrl, &source, &state, bytes).await;
+                    }
+                    GrantOp::FaultIn(index, step) => {
+                        state.lock().unwrap().fault_in_one(index, step);
+                    }
+                    GrantOp::Exit(index) => {
+                        let reserved = state.lock().unwrap().exit_one(index);
+                        if let Some(reserved) = reserved {
+                            ctrl.release(reserved);
+                        }
+                    }
+                }
+            }
+
+            // Unload every worker still in flight (always index 0 until none is
+            // left), releasing each reservation as it goes.
+            loop {
+                let reserved = state.lock().unwrap().exit_one(0);
+                match reserved {
+                    Some(reserved) => ctrl.release(reserved),
+                    None => break,
+                }
+            }
+            {
+                let mut s = state.lock().unwrap();
+                s.pinned_usage = 0;
+                s.residents.clear();
+            }
+
+            let ceiling = (limit as f64 * usable_ratio) as u64;
+            let headroom = ctrl.headroom_bytes();
+            prop_assert_eq!(
+                headroom, ceiling,
+                "headroom {} did not recover to ceiling {} after all workers exited",
+                headroom, ceiling
+            );
+            Ok(())
+        })?; // hand a failed assertion to proptest as an error, not a panic
+    }
+}
+
+// ── Density ──────────────────────────────────────────────────────────────────
+
+proptest! {
+    /// Density invariant: in a settled state (no admission lag outstanding), the
+    /// gate packs the environment to within one request of the usable ceiling
+    /// before it starts rejecting. It must not stop admitting while substantial
+    /// usable room remains.
+    ///
+    /// The schedule admits a fixed request size, fully faulting each admitted
+    /// worker in before the next admit so measured RSS tracks admitted bytes and
+    /// the in-flight reservation drains to zero — the steady-state regime where
+    /// density matters. At the first rejection, resident usage must be more than
+    /// `ceiling - request`: the only room a correct gate may leave free is the
+    /// part too small to fit one more request.
+    #[test]
+    fn admits_to_within_one_request_of_the_ceiling(
+        limit in 2000u64..20_000,
+        request in 50u64..600,
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let usable_ratio = 0.8;
+            let state = Arc::new(Mutex::new(EnvState {
+                limit,
+                ..Default::default()
+            }));
+            let ctrl = controller_with_ratio(state.clone(), usable_ratio);
+            let source = eviction_source(state.clone(), ctrl.clone());
+
+            let ceiling = (limit as f64 * usable_ratio) as u64;
+
+            // Admit until the first rejection, faulting each worker fully in
+            // before the next so no reservation lag is outstanding.
+            let mut rejected = false;
+            for _ in 0..((limit / request) + 2) {
+                let decision = apply_staggered_admit(&ctrl, &source, &state, request).await;
+                state.lock().unwrap().tick_residency(u64::MAX);
+                if decision == AdmissionDecision::Reject {
+                    rejected = true;
+                    break;
+                }
+            }
+
+            prop_assert!(rejected, "gate never rejected; ceiling {ceiling} too large for the schedule");
+
+            let s = state.lock().unwrap();
+            prop_assert!(
+                s.usage() + request > ceiling,
+                "gate rejected at resident usage {} with ceiling {ceiling}: left more than one request ({request}) of usable room free",
+                s.usage()
+            );
+            // And it must never have over-committed.
+            prop_assert!(s.eventual_usage() <= s.limit);
+            Ok(())
+        })?; // hand a failed assertion to proptest as an error, not a panic
+    }
+}
+
+// ── Carve-out ratio ──────────────────────────────────────────────────────────
+
+#[test]
+async fn usable_ratio_caps_admission_below_full_limit() {
+    // An empty environment with a raw limit of 1000 and a usable ratio of 0.8:
+    // the admission ceiling is 800, the remaining 20% is held back for the host.
+    let env = Arc::new(Mutex::new(EnvState {
+        limit: 1000,
+        pinned_usage: 0,
+        residents: Vec::new(),
+        ..Default::default()
+    }));
+    let gate = controller_with_ratio(env.clone(), 0.8);
+    let source = eviction_source(env.clone(), gate.clone());
+
+    // 850 would fit under the raw limit but not under the ceiling.
+    let over_ceiling = apply_admit(&gate, &source, &env, 850).await;
+    assert_eq!(over_ceiling, AdmissionDecision::Reject);
+
+    // A request of exactly the ceiling is still admissible, since the
+    // rejected attempt above recorded nothing.
+    let at_ceiling = apply_admit(&gate, &source, &env, 800).await;
+    assert_eq!(at_ceiling, AdmissionDecision::Admit);
+}
+
+/// Concurrent memory grows must not deadlock against the admission eviction
+/// scan.
+///
+/// A memory grow acquires a permit while the growing worker holds its own
+/// instance lock, and the admission slow path scans the worker set, taking each
+/// other worker's instance lock to classify it for eviction. With many workers
+/// growing at once under memory pressure these two must not form an AB-BA cycle.
+/// Workloads that never grow memory never exercise this path.
+mod grow_lock_ordering {
+    use super::super::{AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource};
+    use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot};
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_r::test;
+    use tokio::sync::Mutex as AsyncMutex;
+
+    /// Per-worker lock, standing in for `Worker::instance`.
+    type WorkerLock = Arc<AsyncMutex<()>>;
+
+    /// Probe pinned to zero admissible headroom so `try_admit` takes the slow
+    /// (scanning) path, modelling the moment a grow's requested delta does not
+    /// fit the current headroom.
+    #[derive(Debug)]
+    struct SaturatedProbe;
+
+    impl MemoryProbe for SaturatedProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: 1,
+                current_bytes: u64::MAX,
+            }
+        }
+    }
+
+    /// Probe reporting ample headroom so `try_admit` takes the fast path and
+    /// never scans — the same grow code path, but not under memory pressure.
+    #[derive(Debug)]
+    struct AmpleHeadroomProbe;
+
+    impl MemoryProbe for AmpleHeadroomProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: u64::MAX,
+                current_bytes: 0,
+            }
+        }
+    }
+
+    /// Eviction source that, like `evict_at_most_memory`, scans every worker and
+    /// takes each worker's instance lock (via `eviction_class`) to classify it.
+    /// Frees nothing (all workers active). The lock on each worker is held only
+    /// briefly, faithfully — the deadlock comes from the ordering, not hold time.
+    struct ScanningEvictionSource {
+        workers: Vec<WorkerLock>,
+    }
+
+    #[async_trait::async_trait]
+    impl EvictionSource for ScanningEvictionSource {
+        async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 {
+            for worker in &self.workers {
+                let _guard = worker.lock().await;
+            }
+            0
+        }
+    }
+
+    /// Models the grow path's lock interaction: run the admission scan, which
+    /// takes other workers' instance locks, without holding this worker's own
+    /// instance lock, then take it afterwards to merge the permit (as
+    /// `Worker::increase_memory` does).
+    async fn grow_then_lock(
+        controller: &AdmissionController,
+        own: &WorkerLock,
+        workers: Vec<WorkerLock>,
+    ) {
+        let source = ScanningEvictionSource { workers };
+        controller.try_admit(1, &source).await;
+        let _own_guard = own.lock().await;
+    }
+
+    fn workers(n: usize) -> Vec<WorkerLock> {
+        (0..n).map(|_| Arc::new(AsyncMutex::new(()))).collect()
+    }
+
+    fn controller(probe: Box<dyn MemoryProbe>) -> Arc<AdmissionController> {
+        Arc::new(AdmissionController::new(
+            probe,
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ))
+    }
+
+    /// Many workers growing concurrently under memory pressure (every grow takes
+    /// the scanning slow path) must all complete — no deadlock, no task panic.
+    #[test(flavor = "multi_thread", worker_threads = 4)]
+    async fn concurrent_grows_do_not_deadlock_under_pressure() {
+        const WORKERS: usize = 32;
+        const DEADLINE: Duration = Duration::from_secs(10);
+
+        let workers = workers(WORKERS);
+        let controller = controller(Box::new(SaturatedProbe));
+
+        let mut grows = Vec::new();
+        for i in 0..WORKERS {
+            let controller = controller.clone();
+            let all = workers.clone();
+            let own = workers[i].clone();
+            grows.push(tokio::spawn(async move {
+                grow_then_lock(&controller, &own, all).await;
+            }));
+        }
+
+        let all_done = async {
+            for task in grows {
+                task.await.expect("grow task panicked");
+            }
+        };
+
+        let result = tokio::time::timeout(DEADLINE, all_done).await;
+        assert!(
+            result.is_ok(),
+            "concurrent grows deadlocked: the scan must not run while a worker holds its own instance lock"
+        );
+    }
+
+    /// With comfortable headroom the gate admits on the fast path without
+    /// scanning, so no worker's instance lock is taken during admission and
+    /// concurrent grows complete. Confirms the deadlock risk is specific to the
+    /// scan-under-pressure path.
+    #[test(flavor = "multi_thread", worker_threads = 4)]
+    async fn no_deadlock_with_ample_headroom() {
+        const WORKERS: usize = 32;
+        const DEADLINE: Duration = Duration::from_secs(10);
+
+        let workers = workers(WORKERS);
+        let controller = controller(Box::new(AmpleHeadroomProbe));
+
+        let mut grows = Vec::new();
+        for i in 0..WORKERS {
+            let controller = controller.clone();
+            let all = workers.clone();
+            let own = workers[i].clone();
+            grows.push(tokio::spawn(async move {
+                grow_then_lock(&controller, &own, all).await;
+            }));
+        }
+
+        let all_done = async {
+            for task in grows {
+                task.await.expect("grow task panicked");
+            }
+        };
+
+        let result = tokio::time::timeout(DEADLINE, all_done).await;
+        assert!(
+            result.is_ok(),
+            "grows with ample headroom should not scan and should not deadlock"
+        );
+    }
+}
diff --git a/golem-worker-executor/src/services/active_workers/component_charge/mod.rs b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs
new file mode 100644
index 0000000000..8ddd4aa8aa
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs
@@ -0,0 +1,171 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Per-component memory charge for the shared compiled module.
+//!
+//! A component's compiled module is loaded into the wasmtime engine once and
+//! shared by every worker of that component, so its size must be charged to the
+//! memory pool once per resident component rather than once per worker. This
+//! registry tracks how many workers of each component are resident and holds a
+//! single module-sized charge for as long as at least one is.
+//!
+//! The charge is represented by an opaque guard obtained from a [`ChargeSource`]
+//! (the worker memory pool in production). The first resident worker of a
+//! component acquires the charge; the last to unload drops it. The registry is
+//! decoupled from the pool via [`ChargeSource`] so the refcounting can be
+//! property-tested in isolation.
+
+use async_trait::async_trait;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::sync::{Arc, Mutex};
+
+/// Acquires an opaque, RAII charge of a given byte size from some pool. The
+/// returned value releases the charge when dropped.
+#[async_trait]
+pub trait ChargeSource: Send + Sync {
+    type Charge: Send + Sync + 'static;
+
+    async fn acquire_charge(&self, bytes: u64) -> Self::Charge;
+}
+
+/// Tracks resident-worker refcounts per component key and holds one module-sized
+/// charge per component while any worker of it is resident.
+pub struct ComponentChargeRegistry<K, S: ChargeSource> {
+    source: S,
+    state: Mutex<HashMap<K, Entry<S::Charge>>>,
+}
+
+struct Entry<C> {
+    refcount: usize,
+    /// The held module charge; `None` only until the first acquirer publishes it.
+    charge: Option<Arc<C>>,
+}
+
+/// Handle representing one worker's residency of a component. While at least one
+/// `ComponentChargeGuard` for a key is alive, the registry holds that
+/// component's module charge. Dropping the last guard releases it.
+pub struct ComponentChargeGuard<K, S: ChargeSource>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+{
+    registry: Arc<ComponentChargeRegistry<K, S>>,
+    key: K,
+}
+
+impl<K, S> Debug for ComponentChargeGuard<K, S>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+    S: ChargeSource,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ComponentChargeGuard").finish()
+    }
+}
+
+/// Type-erased held component charge. A worker holds one of these for as long as
+/// it is resident; dropping it releases the worker's residency of its component.
+/// Erasing the source/key types lets non-generic holders store the guard.
+pub trait HeldComponentCharge: Send + Sync + Debug {}
+
+impl<K, S> HeldComponentCharge for ComponentChargeGuard<K, S>
+where
+    K: Eq + Hash + Clone + Send + Sync + 'static,
+    S: ChargeSource + 'static,
+    S::Charge: Sync,
+{
+}
+
+impl<K, S> ComponentChargeRegistry<K, S>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+    S: ChargeSource,
+{
+    pub fn new(source: S) -> Arc<Self> {
+        Arc::new(Self {
+            source,
+            state: Mutex::new(HashMap::new()),
+        })
+    }
+
+    /// Register one resident worker of `key` (whose module is `charge_bytes`),
+    /// acquiring the module charge if it is the component's first. Residency is
+    /// released when the returned guard drops or this future is cancelled.
+    pub async fn acquire(
+        self: &Arc<Self>,
+        key: K,
+        charge_bytes: u64,
+    ) -> ComponentChargeGuard<K, S> {
+        // Decide under the lock whether this caller is the first resident worker;
+        // only that caller acquires the charge, outside the lock, then publishes it.
+        let must_acquire = {
+            let mut state = self.state.lock().unwrap();
+            let entry = state.entry(key.clone()).or_insert(Entry {
+                refcount: 0,
+                charge: None,
+            });
+            entry.refcount += 1;
+            entry.refcount == 1
+        };
+        // Hand the increment to the guard before awaiting: if this future is
+        // dropped mid-acquire, the guard's `Drop` releases it instead of leaking.
+        let guard = ComponentChargeGuard {
+            registry: self.clone(),
+            key,
+        };
+
+        if must_acquire {
+            let charge = Arc::new(self.source.acquire_charge(charge_bytes).await);
+            let mut state = self.state.lock().unwrap();
+            if let Some(entry) = state.get_mut(&guard.key) {
+                if entry.refcount > 0 && entry.charge.is_none() {
+                    entry.charge = Some(charge);
+                }
+            }
+        }
+        guard
+    }
+
+    fn release(&self, key: &K) {
+        let mut state = self.state.lock().unwrap();
+        if let Some(entry) = state.get_mut(key) {
+            entry.refcount = entry.refcount.saturating_sub(1);
+            if entry.refcount == 0 {
+                // Drop the held charge (returns it to the pool) and forget the
+                // component entirely.
+                state.remove(key);
+            }
+        }
+    }
+}
+
+impl<K, S> Drop for ComponentChargeGuard<K, S>
+where
+    K: Eq + Hash + Clone + Send + 'static,
+    S: ChargeSource,
+{
+    fn drop(&mut self) {
+        self.registry.release(&self.key);
+    }
+}
+
+impl<K, S: ChargeSource> Debug for ComponentChargeRegistry<K, S> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ComponentChargeRegistry").finish()
+    }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/golem-worker-executor/src/services/active_workers/component_charge/tests.rs b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs
new file mode 100644
index 0000000000..c58f1ab937
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs
@@ -0,0 +1,206 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Tests for the per-component module charge registry.
+//!
+//! A [`FakeChargeSource`] models a pool by tracking total charged bytes in an
+//! atomic; each charge it hands out decrements that total when dropped. The
+//! tests then assert the registry's contract: a component's module is charged
+//! exactly once while any worker of it is resident, released when the last
+//! unloads, and never leaked or double-charged under concurrent churn.
+
+use super::*;
+use proptest::prelude::*;
+use std::sync::atomic::{AtomicU64, Ordering};
+use test_r::test;
+
+test_r::enable!();
+
+/// A charge that returns `bytes` to the shared counter when dropped.
+struct FakeCharge {
+    bytes: u64,
+    charged_total: Arc<AtomicU64>,
+}
+
+impl Drop for FakeCharge {
+    fn drop(&mut self) {
+        self.charged_total.fetch_sub(self.bytes, Ordering::SeqCst);
+    }
+}
+
+#[derive(Clone)]
+struct FakeChargeSource {
+    charged_total: Arc<AtomicU64>,
+    /// Number of times a charge was actually acquired, to detect double-charge.
+    acquire_count: Arc<AtomicU64>,
+}
+
+impl FakeChargeSource {
+    fn new() -> Self {
+        Self {
+            charged_total: Arc::new(AtomicU64::new(0)),
+            acquire_count: Arc::new(AtomicU64::new(0)),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl ChargeSource for FakeChargeSource {
+    type Charge = FakeCharge;
+
+    async fn acquire_charge(&self, bytes: u64) -> FakeCharge {
+        self.acquire_count.fetch_add(1, Ordering::SeqCst);
+        self.charged_total.fetch_add(bytes, Ordering::SeqCst);
+        FakeCharge {
+            bytes,
+            charged_total: self.charged_total.clone(),
+        }
+    }
+}
+
+const MODULE_BYTES: u64 = 17 * 1024 * 1024;
+
+// ── Single-case unit tests ───────────────────────────────────────────────────
+
+#[test]
+async fn first_worker_charges_once_last_releases() {
+    let source = FakeChargeSource::new();
+    let charged = source.charged_total.clone();
+    let count = source.acquire_count.clone();
+    let registry = ComponentChargeRegistry::new(source);
+
+    let g1 = registry.acquire("comp-a", MODULE_BYTES).await;
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+    assert_eq!(count.load(Ordering::SeqCst), 1);
+
+    // Second worker of the same component: no additional charge.
+    let g2 = registry.acquire("comp-a", MODULE_BYTES).await;
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+    assert_eq!(count.load(Ordering::SeqCst), 1);
+
+    // Dropping one of two keeps the charge.
+    drop(g1);
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+
+    // Dropping the last releases it.
+    drop(g2);
+    assert_eq!(charged.load(Ordering::SeqCst), 0);
+}
+
+#[test]
+async fn distinct_components_each_charge_once() {
+    let source = FakeChargeSource::new();
+    let charged = source.charged_total.clone();
+    let registry = ComponentChargeRegistry::new(source);
+
+    let _a = registry.acquire("comp-a", MODULE_BYTES).await;
+    let _b = registry.acquire("comp-b", MODULE_BYTES).await;
+    let _b2 = registry.acquire("comp-b", MODULE_BYTES).await;
+
+    // Two distinct components → charged twice, regardless of worker count.
+    assert_eq!(charged.load(Ordering::SeqCst), 2 * MODULE_BYTES);
+}
+
+#[test]
+async fn re_acquiring_after_full_release_charges_again() {
+    let source = FakeChargeSource::new();
+    let charged = source.charged_total.clone();
+    let count = source.acquire_count.clone();
+    let registry = ComponentChargeRegistry::new(source);
+
+    drop(registry.acquire("comp-a", MODULE_BYTES).await);
+    assert_eq!(charged.load(Ordering::SeqCst), 0);
+
+    // A fresh residency after full release acquires the charge again.
+    let _g = registry.acquire("comp-a", MODULE_BYTES).await;
+    assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES);
+    assert_eq!(count.load(Ordering::SeqCst), 2);
+}
+
+// ── Property tests ───────────────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+enum Op {
+    /// Acquire a guard for component index `usize`.
+    Acquire(usize),
+    /// Drop the n-th currently-held guard (modulo number held).
+    Drop(usize),
+}
+
+fn arb_ops(num_components: usize) -> impl Strategy<Value = Vec<Op>> {
+    prop::collection::vec(
+        prop_oneof![
+            (0..num_components).prop_map(Op::Acquire),
+            (0usize..100).prop_map(Op::Drop),
+        ],
+        0..80,
+    )
+}
+
+proptest! {
+    /// The charged total always equals the sum of `MODULE_BYTES` over the distinct
+    /// components that currently have at least one held guard. This is the core
+    /// "once per resident component" contract: never per-worker, never leaked,
+    /// never double-charged.
+    #[test]
+    fn charge_tracks_distinct_resident_components(
+        num_components in 1usize..=6,
+        ops in arb_ops(6),
+    ) {
+        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+        rt.block_on(async move {
+            let source = FakeChargeSource::new();
+            let charged = source.charged_total.clone();
+            let registry = ComponentChargeRegistry::new(source);
+
+            // Held guards keyed by component index.
+            let mut held: Vec<(usize, ComponentChargeGuard<&'static str, FakeChargeSource>)> =
+                Vec::new();
+            let keys: Vec<&'static str> =
+                ["c0", "c1", "c2", "c3", "c4", "c5"][..num_components].to_vec();
+
+            for op in ops {
+                match op {
+                    Op::Acquire(i) => {
+                        let i = i % num_components;
+                        let guard = registry.acquire(keys[i], MODULE_BYTES).await;
+                        held.push((i, guard));
+                    }
+                    Op::Drop(n) => {
+                        if !held.is_empty() {
+                            let idx = n % held.len();
+                            held.remove(idx);
+                        }
+                    }
+                }
+
+                // Distinct resident component count == charged_total / MODULE_BYTES.
+                let mut distinct: Vec<usize> = held.iter().map(|(i, _)| *i).collect();
+                distinct.sort_unstable();
+                distinct.dedup();
+                let expected = distinct.len() as u64 * MODULE_BYTES;
+                prop_assert_eq!(
+                    charged.load(Ordering::SeqCst),
+                    expected,
+                    "charged total did not match distinct resident components"
+                );
+            }
+
+            // After dropping everything, nothing remains charged.
+            drop(held);
+            prop_assert_eq!(charged.load(Ordering::SeqCst), 0);
+            Ok(())
+        })?;
+    }
+}
diff --git a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
index 77c3f74b86..3d20d187b6 100644
--- a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
+++ b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs
@@ -48,42 +48,91 @@ struct AccountSchedulerState {
 
 struct QueuedAgent {
     agent_id: AgentId,
-    waker: tokio::sync::oneshot::Sender<OwnedSemaphorePermit>,
+    waker: tokio::sync::oneshot::Sender<GrantedSlot>,
 }
 
-/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`].
+/// A slot granted from the scheduler: owns the underlying semaphore permit and
+/// the responsibility to decrement the account's `running_count` and wake the
+/// next queued agent when it is released.
 ///
-/// On drop, decrements the account's running count and wakes the next queued
-/// agent (if any). The drop handler is fully synchronous.
-pub struct ConcurrentAgentPermit {
+/// The `running_count` is incremented together with acquiring the raw permit,
+/// and the matching decrement lives only here in `Drop`. This binds the count
+/// strictly to the lifetime of the granted permit, regardless of how the slot
+/// is disposed of:
+///
+/// * it is moved into a [`ConcurrentAgentPermit`] and dropped when the agent
+///   releases the slot (the normal case), or
+/// * it is sent into a queued waiter's oneshot and that waiter is cancelled
+///   before receiving it — the slot is then dropped inside the channel.
+///
+/// Both paths run this same `Drop`, so a slot granted to a waiter that is
+/// cancelled after the grant succeeded cannot leak the count.
+struct GrantedSlot {
     raw: Option<OwnedSemaphorePermit>,
-    account: Option<Arc<AccountScheduler>>,
+    account: Arc<AccountScheduler>,
     account_id: AccountId,
 }
 
-impl Drop for ConcurrentAgentPermit {
+impl Drop for GrantedSlot {
     fn drop(&mut self) {
         if let Some(raw) = self.raw.take() {
             // Return the raw permit to the semaphore first so it is available
             // for the next queued agent's synchronous try-acquire.
             drop(raw);
-
-            if let Some(ref account) = self.account {
-                try_grant_next_sync(account, &self.account_id);
-            }
+            try_grant_next_sync(&self.account, &self.account_id);
         }
     }
 }
 
-impl ConcurrentAgentPermit {
-    /// Consumes the permit without triggering the drop notification.
-    #[allow(dead_code)]
-    pub fn into_inner(mut self) -> Option<OwnedSemaphorePermit> {
-        self.account = None;
+impl GrantedSlot {
+    /// Take the raw permit out, suppressing this slot's `Drop` bookkeeping.
+    ///
+    /// Used only from `drain_ready_queue` when a `send` to a cancelled waiter
+    /// fails: the slot is returned in the `Err`, but we are still holding the
+    /// account state lock, so letting its `Drop` run would re-enter
+    /// `try_grant_next_sync` and deadlock on the same non-reentrant mutex. The
+    /// caller takes the permit back and performs the accounting inline instead.
+    fn defuse(mut self) -> Option<OwnedSemaphorePermit> {
         self.raw.take()
     }
 }
 
+/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`].
+///
+/// On drop, decrements the account's running count and wakes the next queued
+/// agent (if any) via the held [`GrantedSlot`]. Unlimited accounts hold a bare
+/// permit with no slot, so dropping them touches no scheduler accounting. The
+/// drop handler is fully synchronous.
+pub struct ConcurrentAgentPermit {
+    /// `Some` for limited accounts (carries the scheduler accounting); `None`
+    /// for unlimited accounts, where `_raw` holds the bare bypass permit. Held
+    /// purely for its `Drop`, which returns the permit and wakes the next
+    /// queued agent.
+    _slot: Option<GrantedSlot>,
+    /// Bare permit for the unlimited-account bypass path. Unused for limited
+    /// accounts (the permit lives inside `_slot`).
+    _raw: Option<OwnedSemaphorePermit>,
+}
+
+impl ConcurrentAgentPermit {
+    /// A permit for a limited account, carrying the scheduler accounting.
+    fn from_slot(slot: GrantedSlot) -> Self {
+        Self {
+            _slot: Some(slot),
+            _raw: None,
+        }
+    }
+
+    /// A permit for an unlimited account: a bare bypass permit with no
+    /// scheduler accounting.
+    fn unlimited(raw: OwnedSemaphorePermit) -> Self {
+        Self {
+            _slot: None,
+            _raw: Some(raw),
+        }
+    }
+}
+
 impl Default for ConcurrentAgentsScheduler {
     fn default() -> Self {
         Self::new()
@@ -156,11 +205,7 @@ impl ConcurrentAgentsScheduler {
         // Unlimited accounts bypass the queue entirely.
         if is_unlimited(limit) {
             let raw = self.permits.acquire(account_id, || async { false }).await;
-            return ConcurrentAgentPermit {
-                raw: Some(raw),
-                account: None,
-                account_id,
-            };
+            return ConcurrentAgentPermit::unlimited(raw);
         }
 
         // Sync the underlying semaphore pool size with the current plan limit
@@ -175,16 +220,12 @@ impl ConcurrentAgentsScheduler {
         let limit = account.resource_entry.max_concurrent_agents_per_executor();
         if is_unlimited(limit) {
             let raw = self.permits.acquire(account_id, || async { false }).await;
-            return ConcurrentAgentPermit {
-                raw: Some(raw),
-                account: None,
-                account_id,
-            };
+            return ConcurrentAgentPermit::unlimited(raw);
         }
 
         enum AcquireDecision {
             FastPath(OwnedSemaphorePermit),
-            Queued(tokio::sync::oneshot::Receiver<OwnedSemaphorePermit>),
+            Queued(tokio::sync::oneshot::Receiver<GrantedSlot>),
         }
 
         let decision = {
@@ -197,7 +238,7 @@ impl ConcurrentAgentsScheduler {
             // After a plan upgrade, newly added semaphore permits may allow
             // queued agents to proceed. Drain what we can before deciding
             // about the current agent.
-            drain_ready_queue(&mut state, &account.raw_semaphore, limit, &account_id);
+            drain_ready_queue(&mut state, &account, limit, &account_id);
 
             // Fast path: capacity available, no older waiters, and the raw
             // semaphore actually has a permit. We try-acquire the semaphore
@@ -239,26 +280,22 @@ impl ConcurrentAgentsScheduler {
                     "ConcurrentAgentsScheduler: fast-path permit for {agent_id} in account {account_id}"
                 );
 
-                ConcurrentAgentPermit {
+                ConcurrentAgentPermit::from_slot(GrantedSlot {
                     raw: Some(raw),
-                    account: Some(account),
+                    account,
                     account_id,
-                }
+                })
             }
             AcquireDecision::Queued(rx) => {
                 debug!(
                     "ConcurrentAgentsScheduler: {agent_id} queued in account {account_id}, waiting for permit"
                 );
 
-                let raw = rx.await.expect(
+                let slot = rx.await.expect(
                     "ConcurrentAgentsScheduler: oneshot sender dropped without sending — scheduler bug",
                 );
 
-                ConcurrentAgentPermit {
-                    raw: Some(raw),
-                    account: Some(account),
-                    account_id,
-                }
+                ConcurrentAgentPermit::from_slot(slot)
             }
         }
     }
@@ -299,7 +336,7 @@ impl ConcurrentAgentsScheduler {
 /// be fully synchronous. Uses `tokio::sync::Semaphore::try_acquire_owned`
 /// (which is synchronous despite being on a tokio type) to acquire permits
 /// for queued agents.
-fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) {
+fn try_grant_next_sync(account: &Arc<AccountScheduler>, account_id: &AccountId) {
     let limit = account.resource_entry.max_concurrent_agents_per_executor();
     if is_unlimited(limit) {
         return;
@@ -308,7 +345,7 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) {
     let mut state = account.state.lock().unwrap();
     state.running_count = state.running_count.saturating_sub(1);
 
-    drain_ready_queue(&mut state, &account.raw_semaphore, limit, account_id);
+    drain_ready_queue(&mut state, account, limit, account_id);
 }
 
 /// Try to grant permits to queued agents from the front of the ready queue.
@@ -316,9 +353,15 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) {
 /// Called both from `try_grant_next_sync` (Drop path) and from `acquire`
 /// (after a plan-upgrade sync adds new permits). Fully synchronous — only
 /// uses `try_acquire_owned` which does not block.
+///
+/// Each granted permit is wrapped in a [`GrantedSlot`] carrying the
+/// `running_count` decrement, so a waiter cancelled after a successful send
+/// still releases its slot (via the slot's `Drop` when the oneshot channel is
+/// dropped) rather than leaking the count. The increment here is matched
+/// one-for-one by that slot's `Drop`.
 fn drain_ready_queue(
     state: &mut AccountSchedulerState,
-    raw_semaphore: &Arc<tokio::sync::Semaphore>,
+    account: &Arc<AccountScheduler>,
     limit: u64,
     account_id: &AccountId,
 ) {
@@ -326,13 +369,24 @@ fn drain_ready_queue(
         let queued = state.ready_queue.pop_front().unwrap();
 
         // tokio::sync::Semaphore::try_acquire_owned is synchronous.
-        match raw_semaphore.clone().try_acquire_owned() {
+        match account.raw_semaphore.clone().try_acquire_owned() {
             Ok(raw) => {
                 state.running_count += 1;
-                if queued.waker.send(raw).is_err() {
-                    // Waiter was cancelled; the permit inside the oneshot
-                    // is dropped, returning it to the semaphore. Decrement
-                    // and try next.
+                let slot = GrantedSlot {
+                    raw: Some(raw),
+                    account: account.clone(),
+                    account_id: *account_id,
+                };
+                if let Err(slot) = queued.waker.send(slot) {
+                    // Waiter was cancelled before we could hand it the slot.
+                    // We are still holding the state lock, so we must not let
+                    // the returned slot's `Drop` run (it would re-enter this
+                    // path via `try_grant_next_sync` and deadlock). Defuse it,
+                    // return its permit to the semaphore, and account for it
+                    // inline, then try the next queued agent.
+                    if let Some(raw) = slot.defuse() {
+                        drop(raw);
+                    }
                     state.running_count -= 1;
                     debug!(
                         "ConcurrentAgentsScheduler: waiter {} cancelled in account {account_id}, trying next",
diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs
new file mode 100644
index 0000000000..6940b53db4
--- /dev/null
+++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs
@@ -0,0 +1,238 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Platform-abstracted probe of the executor's real memory usage and limit.
+//!
+//! Reports the measured resident memory and hard limit of the process's
+//! environment, used as the authoritative input to admission decisions (in
+//! contrast to the estimate-based semaphore in [`super::ActiveWorkers`]).
+//!
+//! The trait is abstract over where the limit comes from: a containerised Linux
+//! deployment reads it from the cgroup, an unconstrained process reads host RAM,
+//! a configured override pins it explicitly. Backend fidelity is asymmetric —
+//! cgroup v2 gives the exact kernel-enforced number; other targets fall back to
+//! best-effort process RSS via [`ProcessRssProbe`] until dedicated macOS and
+//! Windows backends land.
+
+use std::fmt::Debug;
+
+/// A snapshot of the executor environment's memory state.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct MemorySnapshot {
+    /// Hard ceiling: cgroup `memory.max` on constrained Linux, configured cap
+    /// or host RAM otherwise. Reaching this with `current` triggers an
+    /// OOM-kill.
+    pub limit_bytes: u64,
+    /// Currently-resident bytes: cgroup `memory.current` on Linux (touched
+    /// pages, lagging but exact), process RSS otherwise.
+    pub current_bytes: u64,
+}
+
+impl MemorySnapshot {
+    /// Bytes between current usage and the hard limit. Saturating: never
+    /// underflows if `current` momentarily exceeds the reported `limit`.
+    pub fn headroom_bytes(&self) -> u64 {
+        self.limit_bytes.saturating_sub(self.current_bytes)
+    }
+}
+
+/// Reads the executor environment's real memory state. Sampled at every
+/// admission attempt, including each wasmtime `memory.grow`, so it must be
+/// cheap: the cgroup v2 backend is two small file reads independent of the
+/// number of resident workers.
+pub trait MemoryProbe: Send + Sync + Debug {
+    fn snapshot(&self) -> MemorySnapshot;
+
+    fn limit_bytes(&self) -> u64 {
+        self.snapshot().limit_bytes
+    }
+
+    fn current_bytes(&self) -> u64 {
+        self.snapshot().current_bytes
+    }
+
+    fn headroom_bytes(&self) -> u64 {
+        self.snapshot().headroom_bytes()
+    }
+}
+
+/// A probe whose limit is fixed at construction and whose current usage comes
+/// from cross-platform process RSS via `sysinfo`.
+///
+/// This is the best-effort fallback used wherever no higher-fidelity backend
+/// is available yet (notably macOS and Windows). It is also used when a
+/// `system_memory_override` pins the limit explicitly.
+#[derive(Debug)]
+pub struct ProcessRssProbe {
+    limit_bytes: u64,
+}
+
+impl ProcessRssProbe {
+    pub fn new(limit_bytes: u64) -> Self {
+        Self { limit_bytes }
+    }
+
+    fn current_rss() -> u64 {
+        let mut sysinfo = sysinfo::System::new();
+        let pid = sysinfo::Pid::from_u32(std::process::id());
+        sysinfo.refresh_processes(sysinfo::ProcessesToUpdate::Some(&[pid]), true);
+        sysinfo.process(pid).map(|p| p.memory()).unwrap_or_default()
+    }
+}
+
+impl MemoryProbe for ProcessRssProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.limit_bytes,
+            current_bytes: Self::current_rss(),
+        }
+    }
+}
+
+/// A probe with a fixed limit and a fixed current usage, both set at
+/// construction. Reports the same snapshot on every call regardless of the
+/// host. Used by the in-process test harness, where the executor shares its
+/// process (and therefore its real RSS) with the test framework and other
+/// services, so a process-RSS probe cannot isolate this executor's footprint.
+/// Pinning `current_bytes` to a known value (typically 0) makes the gate decide
+/// purely on the granted accounting against the pinned limit, which is exact and
+/// process-isolated, so memory-pressure tests are deterministic.
+#[derive(Debug)]
+pub struct FixedProbe {
+    limit_bytes: u64,
+    current_bytes: u64,
+}
+
+impl FixedProbe {
+    pub fn new(limit_bytes: u64, current_bytes: u64) -> Self {
+        Self {
+            limit_bytes,
+            current_bytes,
+        }
+    }
+}
+
+impl MemoryProbe for FixedProbe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.limit_bytes,
+            current_bytes: self.current_bytes,
+        }
+    }
+}
+
+/// Linux cgroup v2 probe. Reads `memory.max` and `memory.current` from the
+/// process's cgroup.
+#[cfg(target_os = "linux")]
+#[derive(Debug)]
+pub struct CgroupV2Probe {
+    /// Resolved path to the cgroup directory, e.g. `/sys/fs/cgroup`.
+    base: std::path::PathBuf,
+    /// Fallback limit used when `memory.max` reads `max` (unlimited) — usually
+    /// host RAM or the configured override.
+    fallback_limit_bytes: u64,
+}
+
+#[cfg(target_os = "linux")]
+impl CgroupV2Probe {
+    const DEFAULT_BASE: &'static str = "/sys/fs/cgroup";
+
+    /// Builds a probe rooted at `/sys/fs/cgroup`, or yields `None` when
+    /// `memory.current` cannot be read there, which is taken to mean the host
+    /// is not on cgroup v2; callers then fall back to [`ProcessRssProbe`].
+    ///
+    /// `fallback_limit_bytes` is reported as the limit whenever `memory.max`
+    /// does not yield a number.
+    pub fn try_new(fallback_limit_bytes: u64) -> Option<Self> {
+        let base = std::path::PathBuf::from(Self::DEFAULT_BASE);
+        // The unified hierarchy exposes memory.current directly at the
+        // delegated cgroup path, so a successful read doubles as detection.
+        match std::fs::read_to_string(base.join("memory.current")) {
+            Ok(_) => Some(Self {
+                base,
+                fallback_limit_bytes,
+            }),
+            Err(_) => None,
+        }
+    }
+
+    /// Reads `file` from the cgroup directory and parses its trimmed contents
+    /// as a `u64`. Yields `None` if the file is unreadable or not a number.
+    fn read_u64(&self, file: &str) -> Option<u64> {
+        std::fs::read_to_string(self.base.join(file))
+            .ok()
+            .and_then(|contents| contents.trim().parse::<u64>().ok())
+    }
+
+    /// Resolves the hard limit from `memory.max`.
+    ///
+    /// The file contains either a number of bytes or the literal "max"
+    /// (unlimited). The literal, an unreadable file and an unparsable value
+    /// all resolve to `fallback_limit_bytes`, because none of them yields a
+    /// number.
+    fn read_limit(&self) -> u64 {
+        self.read_u64("memory.max").unwrap_or(self.fallback_limit_bytes)
+    }
+}
+
+#[cfg(target_os = "linux")]
+impl MemoryProbe for CgroupV2Probe {
+    fn snapshot(&self) -> MemorySnapshot {
+        MemorySnapshot {
+            limit_bytes: self.read_limit(),
+            current_bytes: self.read_u64("memory.current").unwrap_or(0),
+        }
+    }
+}
+
+/// Constructs the best available probe.
+///
+/// When `memory_override` is set, the limit is self-declared and treated as an
+/// isolated budget measured against this process's RSS — the executor does not
+/// assume it owns a cgroup. When it is `None`, the executor is assumed to own
+/// its memory environment, so on Linux the exact cgroup v2 numbers are used
+/// (falling back to host RAM / process RSS otherwise).
+pub fn default_probe(memory_override: Option<u64>) -> Box<dyn MemoryProbe> {
+    if let Some(limit) = memory_override {
+        tracing::info!(
+            limit_bytes = limit,
+            "Memory probe: ProcessRssProbe (limit pinned by system_memory_override)"
+        );
+        return Box::new(ProcessRssProbe::new(limit));
+    }
+
+    let host_ram = {
+        let mut sysinfo = sysinfo::System::new();
+        sysinfo.refresh_memory();
+        sysinfo.total_memory()
+    };
+
+    #[cfg(target_os = "linux")]
+    {
+        if let Some(probe) = CgroupV2Probe::try_new(host_ram) {
+            let snapshot = probe.snapshot();
+            tracing::info!(
+                limit_bytes = snapshot.limit_bytes,
+                current_bytes = snapshot.current_bytes,
+                "Memory probe: CgroupV2Probe (cgroup memory.max/current)"
+            );
+            return Box::new(probe);
+        }
+    }
+    tracing::info!(
+        limit_bytes = host_ram,
+        "Memory probe: ProcessRssProbe (host RAM, no cgroup v2 limit)"
+    );
+    Box::new(ProcessRssProbe::new(host_ram))
+}
diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs
index 3a9ece958b..24784065b4 100644
--- a/golem-worker-executor/src/services/active_workers/mod.rs
+++ b/golem-worker-executor/src/services/active_workers/mod.rs
@@ -12,9 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod admission;
+pub mod component_charge;
 pub mod concurrent_agents_scheduler;
 pub mod concurrent_agents_semaphore;
 pub mod fs_semaphore;
+pub mod memory_probe;
 #[cfg(test)]
 mod tests;
 
@@ -26,9 +29,14 @@ pub use fs_semaphore::{
     filesystem_storage_permits_to_bytes, filesystem_storage_pool_bytes_to_permits,
 };
 
+pub(crate) use admission::MemoryGrant;
+use admission::{AdmissionController, EvictionPriority, EvictionSource};
+use async_trait::async_trait;
+pub use component_charge::HeldComponentCharge;
+use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry};
+use memory_probe::{MemoryProbe, default_probe};
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore, TryAcquireError};
 
 use tracing::{Instrument, debug};
 
@@ -40,7 +48,7 @@ use crate::workerctx::WorkerCtx;
 use golem_common::cache::{BackgroundEvictionMode, Cache, FullCacheEvictionMode, SimpleCache};
 use golem_common::model::account::AccountId;
 use golem_common::model::agent::Principal;
-use golem_common::model::component::ComponentRevision;
+use golem_common::model::component::{ComponentId, ComponentRevision};
 use golem_common::model::environment::EnvironmentId;
 use golem_common::model::invocation_context::InvocationContextStack;
 use golem_common::model::worker::AgentConfigEntryDto;
@@ -65,71 +73,96 @@ impl RegisteredConcurrentAccount {
 /// Holds the metadata and wasmtime structures of currently active Golem workers
 pub struct ActiveWorkers<Ctx: WorkerCtx> {
     workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
-    worker_memory: Arc<Semaphore>,
     worker_filesystem_storage: Arc<FilesystemStorageSemaphore>,
     concurrent_agents: Arc<ConcurrentAgentsScheduler>,
-    priority_allocation_lock: Arc<Mutex<()>>,
     acquire_retry_delay: Duration,
+    /// Authoritative measured-headroom admission gate, and the sole admission
+    /// authority. Decides whether real memory headroom permits a new
+    /// acquisition, evicting via the worker set when short. `None` when measured
+    /// admission is disabled (e.g. shared test environments), in which case
+    /// acquisition always proceeds.
+    admission: Option<Arc<AdmissionController>>,
+    /// Reserves each resident component's compiled module size with the gate
+    /// exactly once (shared across all its workers) rather than per worker, so
+    /// the module's resident cost is accounted before it faults into memory.
+    component_charges: Arc<ComponentChargeRegistry<ComponentChargeKey, GateChargeSource>>,
+    /// Multiplier applied to a component's `component_size` when sizing its
+    /// module charge.
+    component_size_coefficient: f64,
 }
 
-#[derive(Debug)]
-pub struct WorkerMemoryPermit {
-    permit: Option<OwnedSemaphorePermit>,
-}
-
-impl WorkerMemoryPermit {
-    fn new(permit: OwnedSemaphorePermit) -> Self {
-        crate::metrics::workers::record_memory_permit_acquired(permit.num_permits());
-        Self {
-            permit: Some(permit),
-        }
-    }
+/// Identifies a compiled component for module-charge accounting.
+type ComponentChargeKey = (ComponentId, ComponentRevision);
 
-    pub fn num_permits(&self) -> usize {
-        self.permit
-            .as_ref()
-            .map_or(0, |permit| permit.num_permits())
-    }
-
-    pub fn merge(&mut self, mut other: Self) {
-        if let Some(other_permit) = other.permit.take() {
-            match &mut self.permit {
-                Some(permit) => permit.merge(other_permit),
-                None => self.permit = Some(other_permit),
-            }
-        }
-    }
-}
-
-impl Drop for WorkerMemoryPermit {
-    fn drop(&mut self) {
-        crate::metrics::workers::record_memory_permit_released(self.num_permits());
-    }
-}
+/// Guard held by a resident worker keeping its component's module charge alive.
+pub type WorkerComponentCharge = ComponentChargeGuard<ComponentChargeKey, GateChargeSource>;
 
 impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self {
-        let worker_memory_size = memory_config.worker_memory();
+        // Build the probe once and hand it to the measured-headroom gate. With a
+        // `system_memory_override` that value is the limit; otherwise the probe
+        // prefers the cgroup v2 limit on Linux and falls back to host RAM.
+        let probe = default_probe(memory_config.system_memory_override);
+        Self::new_with_probe(probe, memory_config, storage_config)
+    }
+
+    /// Like [`Self::new`] but with an explicitly provided memory probe instead of
+    /// the one derived from the config. The in-process test harness uses this to
+    /// supply a probe with a pinned limit and current usage, so the gate's
+    /// decision is deterministic and isolated from the shared test process's RSS.
+    pub fn new_with_probe(
+        probe: Box<dyn MemoryProbe>,
+        memory_config: &MemoryConfig,
+        storage_config: &FilesystemStorageConfig,
+    ) -> Self {
+        let admission = memory_config.enable_measured_admission.then(|| {
+            Arc::new(AdmissionController::new(
+                probe,
+                memory_config.admission_policy(),
+            ))
+        });
+        let workers = Cache::new(
+            None,
+            FullCacheEvictionMode::None,
+            BackgroundEvictionMode::None,
+            "active_workers",
+        );
+        let component_charges = ComponentChargeRegistry::new(GateChargeSource {
+            admission: admission.clone(),
+        });
         let active_workers = Self {
-            workers: Cache::new(
-                None,
-                FullCacheEvictionMode::None,
-                BackgroundEvictionMode::None,
-                "active_workers",
-            ),
-            worker_memory: Arc::new(Semaphore::new(worker_memory_size)),
+            workers,
             worker_filesystem_storage: Arc::new(FilesystemStorageSemaphore::new(
                 storage_config.worker_filesystem_storage(),
                 storage_config.acquire_retry_delay,
             )),
             concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()),
             acquire_retry_delay: memory_config.acquire_retry_delay,
-            priority_allocation_lock: Arc::new(Mutex::new(())),
+            admission,
+            component_charges,
+            component_size_coefficient: memory_config.component_size_coefficient,
         };
-        active_workers.initialize_metrics(worker_memory_size);
+        active_workers.initialize_metrics();
         active_workers
     }
 
+    /// Obtains the module charge for a worker of the given component revision.
+    /// The charge is `component_module_bytes` scaled by
+    /// `component_size_coefficient`. The registry reserves it with the gate for
+    /// the first holder only; later workers of the same key share it, and it is
+    /// released once the last guard is dropped.
+    pub async fn acquire_component_charge(
+        &self,
+        component_id: ComponentId,
+        component_revision: ComponentRevision,
+        component_module_bytes: u64,
+    ) -> WorkerComponentCharge {
+        let key: ComponentChargeKey = (component_id, component_revision);
+        // Float-to-int `as` saturates: a negative or NaN product charges 0 bytes.
+        let scaled = self.component_size_coefficient * component_module_bytes as f64;
+        self.component_charges.acquire(key, scaled as u64).await
+    }
+
     pub async fn get_or_add<T>(
         &self,
         deps: &T,
@@ -202,159 +235,60 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
         }
     }
 
-    pub async fn acquire(&self, memory: u64) -> WorkerMemoryPermit {
-        let mem32: u32 = memory
-            .try_into()
-            .expect("requested memory size is too large");
-
+    /// Blocking memory admission for a starting worker. Loops until the gate
+    /// admits the request, backing off between attempts, and returns a
+    /// [`MemoryGrant`] guard owning the reservation: the worker holds it for as
+    /// long as it is resident and releases it by dropping the guard, so a start
+    /// cancelled before the worker becomes resident cannot leak the reservation.
+    ///
+    /// A rejection is transient, not terminal. The gate reads resident memory
+    /// from the probe, which lags real usage (cgroup `memory.current` only counts
+    /// already-touched pages), so a worker admitted earlier may not yet be fully
+    /// resident; pressure eases as its pages settle and as other workers finish.
+    /// Each iteration backs off and re-reads the gate, so the caller eventually
+    /// proceeds once headroom recovers rather than failing under momentary
+    /// pressure. With measured admission disabled the worker is admitted
+    /// immediately with an inert grant.
+    pub(crate) async fn acquire(&self, memory: u64) -> MemoryGrant {
+        let Some(admission) = &self.admission else {
+            return MemoryGrant::inert();
+        };
         loop {
-            let available = self.worker_memory.available_permits();
-            let lock = self.priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once
-            let result = self.worker_memory.clone().try_acquire_many_owned(mem32);
-            drop(lock);
-            match result {
-                Ok(permit) => {
-                    debug!(
-                        "Acquired {} memory of {}, new available: {}, permit size: {}",
-                        mem32,
-                        available,
-                        self.worker_memory.available_permits(),
-                        permit.num_permits()
-                    );
-                    break WorkerMemoryPermit::new(permit);
-                }
-                Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"),
-                Err(TryAcquireError::NoPermits) => {
-                    debug!(
-                        "Not enough memory to allocate {mem32} (available: {}), trying to free some up",
-                        self.worker_memory.available_permits()
-                    );
-                    if self.try_free_up_memory(memory).await {
-                        debug!("Freed up some memory, retrying");
-                        // We have enough memory unless another worker has taken it in the meantime,
-                        // so retry the loop
-                        continue;
-                    } else {
-                        debug!(
-                            "Could not free up memory, retrying asking for permits after some time"
-                        );
-                        // Could not free up enough memory, so waiting for permits to be available.
-                        // We cannot use acquire_many() to wait for the permits because it eagerly preallocates
-                        // the available permits, and by that causing deadlocks. So we sleep and retry.
-
-                        tokio::time::sleep(self.acquire_retry_delay).await;
-                    }
-                }
+            // Evicts idle-then-warm when real headroom is short; rejects (and we
+            // back off) when it cannot make room rather than risking the limit.
+            if let Some(grant) = admission.admit(memory, &self.eviction_source()).await {
+                return grant;
             }
+            debug!("Measured headroom insufficient for {memory}, backing off and retrying");
+            tokio::time::sleep(self.acquire_retry_delay).await;
         }
     }
 
-    pub async fn try_acquire(&self, memory: u64) -> Option<WorkerMemoryPermit> {
-        let mem32: u32 = memory
-            .try_into()
-            .expect("requested memory size is too large");
-        let mut lock = None;
-        loop {
-            match self.worker_memory.clone().try_acquire_many_owned(mem32) {
-                Ok(permit) => {
-                    debug!(
-                        "Acquired {} memory of {}",
-                        mem32,
-                        self.worker_memory.available_permits()
-                    );
-                    break Some(WorkerMemoryPermit::new(permit));
-                }
-                Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"),
-                Err(TryAcquireError::NoPermits) => {
-                    if lock.is_none() {
-                        debug!(
-                            "Not enough available memory to acquire {mem32} (available: {}), cancelling waiting acquires and retry",
-                            self.worker_memory.available_permits()
-                        );
-                        lock = Some(self.priority_allocation_lock.lock().await);
-                        continue;
-                    } else {
-                        debug!(
-                            "Not enough available memory to acquire {mem32} (available: {})",
-                            self.worker_memory.available_permits()
-                        );
-                        break None;
-                    }
-                }
-            }
+    /// Builds an [`EvictionSource`] view over the live worker set for the
+    /// admission controller to reclaim memory through.
+    fn eviction_source(&self) -> WorkerEvictionSource<Ctx> {
+        WorkerEvictionSource {
+            workers: self.workers.clone(),
         }
     }
 
-    async fn try_free_up_memory(&self, memory: u64) -> bool {
-        let current_avail = self.worker_memory.available_permits();
-        let needed = memory.saturating_sub(current_avail as u64);
-
-        if needed > 0 {
-            let mut idle_candidates = Vec::new();
-            let mut warm_candidates = Vec::new();
-
-            debug!("Collecting memory eviction candidates");
-            let pairs = self.workers.iter().await;
-            for (agent_id, worker) in pairs {
-                if let Some(class) = worker.eviction_class().await
-                    && let Ok(mem) = worker.memory_requirement().await
-                {
-                    let last_changed = worker.last_execution_state_change();
-                    let entry = (agent_id, worker, mem, last_changed);
-                    match class {
-                        crate::worker::EvictionClass::LoadedIdle => {
-                            idle_candidates.push(entry);
-                        }
-                        crate::worker::EvictionClass::WarmRunnable => {
-                            warm_candidates.push(entry);
-                        }
-                    }
-                }
-            }
-
-            // Sort each bucket by timestamp — newest first so we pop oldest
-            idle_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis());
-            idle_candidates.reverse();
-            warm_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis());
-            warm_candidates.reverse();
-
-            let mut freed = 0u64;
-
-            // First evict LoadedIdle workers (cheapest)
-            while freed < needed && !idle_candidates.is_empty() {
-                let (agent_id, worker, mem, _) = idle_candidates.pop().unwrap();
-                debug!("Trying to stop idle {agent_id} to free up memory");
-                if worker
-                    .stop_if_evictable(crate::worker::EvictionClass::LoadedIdle)
-                    .await
-                {
-                    debug!("Stopped idle {agent_id} to free up {mem} memory");
-                    crate::metrics::workers::record_worker_eviction("LoadedIdle");
-                    freed += mem;
-                }
-            }
-
-            // Then evict WarmRunnable workers if still under pressure
-            while freed < needed && !warm_candidates.is_empty() {
-                let (agent_id, worker, mem, _) = warm_candidates.pop().unwrap();
-                debug!("Trying to stop warm-runnable {agent_id} to free up memory");
-                if worker
-                    .stop_if_evictable(crate::worker::EvictionClass::WarmRunnable)
-                    .await
-                {
-                    debug!("Stopped warm-runnable {agent_id} to free up {mem} memory");
-                    crate::metrics::workers::record_worker_eviction("WarmRunnable");
-                    freed += mem;
-                }
-            }
-
-            if freed > 0 {
-                debug!("Freed up {freed}");
+    /// Non-blocking memory admission for a growing worker. A single gate attempt:
+    /// returns the additional [`MemoryGrant`] when the grow is admitted, or `None`
+    /// when real headroom is insufficient even after eviction (the caller turns
+    /// `None` into a retriable out-of-memory trap). The returned grant should be
+    /// merged into the worker's existing grant so its whole reservation is
+    /// released together on unload. With measured admission disabled the grow is
+    /// always admitted with an inert grant.
+    pub(crate) async fn try_acquire(&self, memory: u64) -> Option<MemoryGrant> {
+        let Some(admission) = &self.admission else {
+            return Some(MemoryGrant::inert());
+        };
+        match admission.admit(memory, &self.eviction_source()).await {
+            Some(grant) => Some(grant),
+            None => {
+                debug!("Measured headroom insufficient for {memory}, not admitting");
+                None
             }
-            freed >= needed
-        } else {
-            debug!("Memory was freed up in the meantime");
-            true
         }
     }
 
@@ -471,11 +405,111 @@ impl<Ctx: WorkerCtx> ActiveWorkers<Ctx> {
     }
 
     /// Initializes worker gauges. Subsequent changes are recorded inline at the mutation sites.
-    fn initialize_metrics(&self, worker_memory_size: usize) {
+    fn initialize_metrics(&self) {
         crate::metrics::workers::initialize_worker_metrics();
         crate::metrics::workers::set_filesystem_semaphore_available(
             self.worker_filesystem_storage.available_bytes(),
         );
-        crate::metrics::storage::record_worker_memory_pool_total(worker_memory_size as u64);
+    }
+}
+
+impl From<EvictionPriority> for crate::worker::EvictionClass {
+    fn from(tier: EvictionPriority) -> Self {
+        match tier {
+            EvictionPriority::Idle => Self::LoadedIdle,
+            EvictionPriority::Warm => Self::WarmRunnable,
+        }
+    }
+}
+
+/// Stops resident workers of one priority tier, least recently changed first,
+/// until their summed `memory_requirement()` reaches `needed_bytes` or the tier
+/// has no candidates left. Returns that sum, which can overshoot the request.
+async fn evict_at_most_memory<Ctx: WorkerCtx>(
+    workers: &Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
+    priority: EvictionPriority,
+    needed_bytes: u64,
+) -> u64 {
+    let target_class = crate::worker::EvictionClass::from(priority);
+    let eviction_label = match priority {
+        EvictionPriority::Idle => "LoadedIdle",
+        EvictionPriority::Warm => "WarmRunnable",
+    };
+
+    let mut victims = Vec::new();
+    for (agent_id, worker) in workers.iter().await {
+        if let Some(class) = worker.eviction_class().await
+            && class == target_class
+            && let Ok(mem) = worker.memory_requirement().await
+        {
+            let changed_at = worker.last_execution_state_change();
+            victims.push((agent_id, worker, mem, changed_at));
+        }
+    }
+
+    // Ascending sort plus reverse leaves the oldest entry last, where `pop` finds it.
+    victims.sort_by_key(|(_, _, _, changed_at)| changed_at.to_millis());
+    victims.reverse();
+
+    let mut freed = 0u64;
+    while freed < needed_bytes && let Some((agent_id, worker, mem, _)) = victims.pop() {
+        debug!("Trying to stop {target_class:?} {agent_id} to free up memory");
+        if worker.stop_if_evictable(target_class).await {
+            debug!("Stopped {target_class:?} {agent_id} to free up {mem} memory");
+            crate::metrics::workers::record_worker_eviction(eviction_label);
+            freed += mem;
+        }
+    }
+    freed
+}
+
+/// A source of evictable, already-resident memory the gate reclaims through.
+struct WorkerEvictionSource<Ctx: WorkerCtx> {
+    workers: Cache<AgentId, (), Arc<Worker<Ctx>>, WorkerExecutorError>,
+}
+
+#[async_trait]
+impl<Ctx: WorkerCtx> EvictionSource for WorkerEvictionSource<Ctx> {
+    async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 {
+        evict_at_most_memory(&self.workers, priority, needed_bytes).await
+    }
+}
+
+/// Production [`ChargeSource`] for the per-component module charge: reserves the
+/// module's bytes with the measured-headroom gate. The module is a committed
+/// consequence of admitting the first worker of a component (it loads into RAM
+/// when that worker becomes resident), so it is reserved rather than admitted —
+/// it neither evicts nor can be refused. The `admission` field is `None` when
+/// measured admission is disabled, in which case the charge is a no-op.
+pub struct GateChargeSource {
+    admission: Option<Arc<AdmissionController>>,
+}
+
+/// Held module charge: releases its reserved bytes from the gate on drop.
+pub struct GateCharge {
+    admission: Option<Arc<AdmissionController>>,
+    bytes: u64,
+}
+
+impl Drop for GateCharge {
+    fn drop(&mut self) {
+        if let Some(admission) = &self.admission {
+            admission.release(self.bytes);
+        }
+    }
+}
+
+#[async_trait]
+impl ChargeSource for GateChargeSource {
+    type Charge = GateCharge;
+
+    /// Reserves `bytes` with the gate (when one is configured) and returns the
+    /// guard that hands the same amount back on drop.
+    async fn acquire_charge(&self, bytes: u64) -> GateCharge {
+        let admission = self.admission.clone();
+        if let Some(gate) = &admission {
+            gate.reserve_committed(bytes);
+        }
+        GateCharge { admission, bytes }
     }
 }
diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs
index 82430c243b..217c0e21b6 100644
--- a/golem-worker-executor/src/services/active_workers/tests.rs
+++ b/golem-worker-executor/src/services/active_workers/tests.rs
@@ -729,3 +729,556 @@ async fn scheduler_accounts_are_independent() {
     drop(a1);
     drop(a2);
 }
+
+// ── Component module charge against the admission gate ───────────────────────
+
+mod component_module_charge {
+    use super::super::admission::{AdmissionController, AdmissionPolicy};
+    use super::super::component_charge::ComponentChargeRegistry;
+    use super::super::memory_probe::{MemoryProbe, MemorySnapshot};
+    use super::super::{ComponentChargeKey, GateChargeSource, HeldComponentCharge};
+    use golem_common::model::component::{ComponentId, ComponentRevision};
+    use std::sync::Arc;
+    use test_r::test;
+    use uuid::Uuid;
+
+    /// Probe reporting a fixed limit and zero resident memory, so the gate's
+    /// reservation is driven entirely by what is charged through it.
+    #[derive(Debug)]
+    struct FixedProbe {
+        limit: u64,
+    }
+
+    impl MemoryProbe for FixedProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: self.limit,
+                current_bytes: 0,
+            }
+        }
+    }
+
+    fn key() -> ComponentChargeKey {
+        (ComponentId(Uuid::new_v4()), ComponentRevision::INITIAL)
+    }
+
+    /// The first worker of a component reserves the module's bytes with the gate,
+    /// so admissible headroom drops by the module size before it faults into
+    /// memory. A second worker of the same component reserves nothing more, and
+    /// the reservation is released only when the last worker unloads.
+    #[test]
+    async fn module_charge_reserves_with_gate_until_last_worker_unloads() {
+        let limit = 1000u64;
+        let module_bytes = 200u64;
+        let controller = Arc::new(AdmissionController::new(
+            Box::new(FixedProbe { limit }),
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ));
+        let registry = ComponentChargeRegistry::new(GateChargeSource {
+            admission: Some(controller.clone()),
+        });
+        let component = key();
+
+        assert_eq!(controller.headroom_bytes(), limit);
+
+        let first = registry.acquire(component, module_bytes).await;
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit - module_bytes,
+            "first worker of a component must reserve the module size with the gate"
+        );
+
+        let second = registry.acquire(component, module_bytes).await;
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit - module_bytes,
+            "a second worker of the same component must not reserve the module again"
+        );
+
+        drop(first);
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit - module_bytes,
+            "the module stays reserved while any worker of the component is resident"
+        );
+
+        drop(second);
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit,
+            "the module reservation is released when the last worker unloads"
+        );
+    }
+
+    /// A `RunningWorker` stores its component charge as
+    /// `Box<dyn HeldComponentCharge>` and releases it by dropping that box when
+    /// the worker unloads. Dropping the box must still release the module
+    /// reservation with the gate, i.e. the concrete charge's release runs through
+    /// the trait object exactly as it would for a live worker.
+    #[test]
+    async fn dropping_boxed_charge_releases_the_reservation() {
+        let limit = 1000u64;
+        let module_bytes = 200u64;
+        let controller = Arc::new(AdmissionController::new(
+            Box::new(FixedProbe { limit }),
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ));
+        let registry = ComponentChargeRegistry::new(GateChargeSource {
+            admission: Some(controller.clone()),
+        });
+
+        let charge = registry.acquire(key(), module_bytes).await;
+        // Store it exactly as RunningWorker does.
+        let boxed: Box<dyn HeldComponentCharge> = Box::new(charge);
+        assert_eq!(controller.headroom_bytes(), limit - module_bytes);
+
+        drop(boxed);
+        assert_eq!(
+            controller.headroom_bytes(),
+            limit,
+            "dropping the boxed charge (as on worker unload) must release the reservation"
+        );
+    }
+}
+
+// ── ConcurrentAgentsScheduler — model-based liveness property ────────────────
+//
+// The scheduler keeps its own `running_count` integer alongside the real tokio
+// semaphore permits. The two must stay in lockstep: every increment of
+// `running_count` must be matched by exactly one decrement, regardless of how a
+// granted slot is disposed of (released by a live worker, or dropped inside a
+// cancelled waiter's oneshot channel). If they drift, the scheduler wedges —
+// `running_count` sticks at the limit while permits are actually free, and
+// every future acquire queues forever. This is the production deadlock the
+// property is designed to catch.
+//
+// The model drives random interleavings of acquire / release / cancel against
+// the real scheduler and, after every step, asserts the *liveness* invariant:
+// whenever fewer permits are genuinely held than the limit allows, a fresh
+// acquire must succeed promptly. A leaked `running_count` violates this.
+mod scheduler_liveness {
+    use super::super::concurrent_agents_scheduler::{
+        ConcurrentAgentPermit, ConcurrentAgentsScheduler,
+    };
+    use super::{account, agent, resource_entry_with_agent_limit};
+    use proptest::prelude::*;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_r::test;
+    use tokio::task::JoinHandle;
+
+    /// One step in a randomized scheduler workload.
+    #[derive(Debug, Clone)]
+    enum Op {
+        /// Acquire a permit and hold it (resolves immediately if capacity is
+        /// free, otherwise the in-flight acquire is parked in `pending`).
+        Acquire,
+        /// Release a currently-held permit, if any.
+        Release(prop::sample::Index),
+        /// Cancel an in-flight (likely queued) acquire, if any. Exercises both
+        /// "cancelled while queued" and "cancelled just after being granted".
+        CancelPending(prop::sample::Index),
+        /// Release a held permit and, in the same step, cancel an in-flight
+        /// acquire. This is the deadly race: the released slot may be granted
+        /// to the in-flight acquire's oneshot and then the acquire is cancelled
+        /// before it can receive it. The slot must still be released.
+        ReleaseThenCancel(prop::sample::Index, prop::sample::Index),
+    }
+
+    fn arb_ops() -> impl Strategy<Value = Vec<Op>> {
+        prop::collection::vec(
+            prop_oneof![
+                3 => Just(Op::Acquire),
+                2 => any::<prop::sample::Index>().prop_map(Op::Release),
+                2 => any::<prop::sample::Index>().prop_map(Op::CancelPending),
+                3 => (any::<prop::sample::Index>(), any::<prop::sample::Index>())
+                    .prop_map(|(a, b)| Op::ReleaseThenCancel(a, b)),
+            ],
+            1..60,
+        )
+    }
+
+    /// Let any synchronous grant/drain bookkeeping triggered by a release or
+    /// cancellation settle before the next observation.
+    async fn settle() {
+        for _ in 0..8 {
+            tokio::task::yield_now().await;
+        }
+        tokio::time::sleep(Duration::from_millis(1)).await;
+    }
+
+    proptest! {
+        // Cap shrink iterations so a failing (buggy) run cannot spend minutes
+        // re-running wedging inputs against the overall timeout while shrinking.
+        #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })]
+
+        /// Liveness: under any interleaving of acquire / release / cancel, the
+        /// scheduler never wedges. After each step, if fewer permits are held
+        /// than the limit, a fresh acquire must succeed within a short timeout.
+        /// At the end, draining all held permits must let the account return to
+        /// full capacity.
+        #[test]
+        fn scheduler_never_wedges_under_churn(
+            limit in 1usize..6,
+            ops in arb_ops(),
+        ) {
+            let rt = tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(2)
+                .enable_time()
+                .build()
+                .unwrap();
+
+            rt.block_on(async move {
+                // Bound the whole case so a wedge fails fast and deterministically
+                // rather than hanging the test suite. A correct scheduler completes
+                // a 60-op workload in well under a second; the bug deadlocks here,
+                // so a tight bound makes the failure (and any shrinking) quick.
+                let outcome = tokio::time::timeout(Duration::from_secs(3), async move {
+                    run_workload(limit, ops).await
+                })
+                .await;
+
+                match outcome {
+                    Ok(result) => result,
+                    Err(_elapsed) => Err(TestCaseError::fail(
+                        "scheduler workload did not complete within the overall timeout — \
+                         deadlock (running_count leaked above true occupancy)",
+                    )),
+                }
+            })?;
+        }
+    }
+
+    /// Drives one randomized workload (`ops`) against a freshly registered account
+    /// whose concurrent-agent limit is `limit`, returning `Err` as soon as a
+    /// liveness check fails. Kept separate from the proptest body so the caller
+    /// can wrap the whole run in an overall timeout.
+    async fn run_workload(limit: usize, ops: Vec<Op>) -> Result<(), TestCaseError> {
+        // Short per-acquire timeout: a wedge must surface quickly, but allow
+        // enough slack for genuine multi-thread scheduling jitter.
+        const PROBE_TIMEOUT: Duration = Duration::from_millis(500);
+
+        let sched = Arc::new(ConcurrentAgentsScheduler::new());
+        let acc = account();
+        sched
+            .register_account(acc, resource_entry_with_agent_limit(limit as u64))
+            .await;
+
+        // Permits we are deliberately holding (count against the limit).
+        let mut held: Vec<ConcurrentAgentPermit> = Vec::new();
+        // In-flight acquires not yet resolved (queued or just granted).
+        let mut pending: Vec<JoinHandle<ConcurrentAgentPermit>> = Vec::new();
+        let mut counter = 0usize;
+
+        for op in ops {
+            match op {
+                Op::Acquire => {
+                    counter += 1;
+                    let sched = sched.clone();
+                    let name = format!("W{counter}");
+                    let handle =
+                        tokio::spawn(async move { sched.acquire(acc, agent(&name)).await });
+                    pending.push(handle);
+                }
+                Op::Release(idx) => {
+                    if !held.is_empty() {
+                        let i = idx.index(held.len());
+                        drop(held.remove(i));
+                    }
+                }
+                Op::CancelPending(idx) => {
+                    if !pending.is_empty() {
+                        let i = idx.index(pending.len());
+                        pending.remove(i).abort();
+                    }
+                }
+                Op::ReleaseThenCancel(ri, ci) => {
+                    if !held.is_empty() {
+                        let i = ri.index(held.len());
+                        drop(held.remove(i));
+                    }
+                    if !pending.is_empty() {
+                        let i = ci.index(pending.len());
+                        pending.remove(i).abort();
+                    }
+                }
+            }
+
+            settle().await;
+
+            // Sweep resolved acquires into `held` so `held.len()` tracks occupancy.
+            let mut still_pending = Vec::new();
+            for h in pending.drain(..) {
+                if h.is_finished() {
+                    if let Ok(permit) = h.await {
+                        held.push(permit);
+                    }
+                    // Cancelled/aborted handles are simply dropped.
+                } else {
+                    still_pending.push(h);
+                }
+            }
+            pending = still_pending;
+
+            // Liveness invariant: below the limit, a fresh acquire must succeed
+            // promptly; a leaked running_count would hang it into the timeout.
+            // NOTE(review): an acquire granted after the sweep above holds a
+            // permit `held` does not count — this relies on `settle()`; confirm.
+            if held.len() < limit {
+                let probe =
+                    tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("probe"))).await;
+                prop_assert!(
+                    probe.is_ok(),
+                    "scheduler wedged: held {} < limit {} but acquire timed out",
+                    held.len(),
+                    limit,
+                );
+                // Release the probe immediately.
+                drop(probe.ok());
+                settle().await;
+            }
+        }
+
+        // Abort everything still queued and drop all held permits; the account
+        // must then drain back to full capacity: `limit` fresh acquires succeed.
+        for h in pending.drain(..) {
+            h.abort();
+            let _ = h.await;
+        }
+        held.clear();
+        settle().await;
+
+        let mut drained = Vec::new();
+        for _ in 0..limit {
+            let p = tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("drain"))).await;
+            prop_assert!(
+                p.is_ok(),
+                "scheduler did not return to full capacity after churn",
+            );
+            drained.push(p.unwrap());
+        }
+        Ok(())
+    }
+}
+
+// ── Grant-guard liveness under random churn ──────────────────────────────────
+//
+// A worker's memory grant is reserved with the admission gate and then owned by
+// a guard that lives in one of three places over the worker's lifetime: in the
+// in-flight start task (waiting for permits), in the resident worker (started),
+// or dropped (the worker exited or its start was cancelled). The liveness
+// invariant — mirroring `scheduler_liveness` for the concurrent-agents scheduler
+// — is that however the guard travels between those places, the gate's
+// accounting stays symmetric: once every guard is gone, admissible headroom
+// returns to the full ceiling. A reservation released zero times (leak, the
+// cancelled-while-waiting deletion bug) or more than once (double-release) breaks
+// it. With a zero-usage probe, headroom is `ceiling - granted`, so the final
+// headroom reads the granted total directly.
+mod grant_guard_liveness {
+    use super::super::admission::{
+        AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource, MemoryGrant,
+    };
+    use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot};
+    use proptest::prelude::*;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_r::test;
+    use tokio::task::JoinHandle;
+
+    /// Probe with a fixed limit reporting zero resident usage, so admissible
+    /// headroom equals `ceiling - granted` and reads the granted accounting
+    /// directly — the quantity a leaked or double-released grant corrupts.
+    #[derive(Debug)]
+    struct ZeroUsageProbe {
+        limit: u64,
+    }
+
+    impl MemoryProbe for ZeroUsageProbe {
+        fn snapshot(&self) -> MemorySnapshot {
+            MemorySnapshot {
+                limit_bytes: self.limit,
+                current_bytes: 0,
+            }
+        }
+    }
+
+    /// Eviction source with nothing to evict: it always frees 0 bytes, so a
+    /// rejected request stays rejected. NOTE(review): `arb_ops` can request more
+    /// in total than a small `limit`, so rejections are expected, not a leak.
+    struct NoEvictionSource;
+
+    #[async_trait::async_trait]
+    impl EvictionSource for NoEvictionSource {
+        async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 {
+            0
+        }
+    }
+
+    /// One step in a randomized grant-lifecycle workload.
+    #[derive(Debug, Clone)]
+    enum Op {
+        /// Begin a worker start: spawn a task that requests a grant of this many
+        /// bytes, reports the outcome to the driver, and then parks, as a worker
+        /// waits for its remaining permits before becoming resident.
+        Start(u64),
+        /// A still-in-flight start becomes resident: the driver takes the grant
+        /// guard the task reported and keeps it (the worker is now running).
+        Resident(prop::sample::Index),
+        /// Cancel a still-in-flight start, as deleting a waiting worker does:
+        /// abort the task and drop any grant guard the start obtained.
+        CancelStart(prop::sample::Index),
+        /// A resident worker exits: drop its grant guard.
+        Exit(prop::sample::Index),
+    }
+
+    /// Strategy producing a workload of 1 to 79 weighted random steps.
+    fn arb_ops() -> impl Strategy<Value = Vec<Op>> {
+        let index = any::<prop::sample::Index>;
+        let step = prop_oneof![
+            4 => (1u64..50).prop_map(Op::Start),
+            2 => index().prop_map(Op::Resident),
+            3 => index().prop_map(Op::CancelStart),
+            2 => index().prop_map(Op::Exit),
+        ];
+        prop::collection::vec(step, 1..80)
+    }
+
+    /// An in-flight start: the task runs admission, reports the outcome back over
+    /// `ready` (the grant on admit, `None` if the gate rejected it), then parks.
+    /// Once sent, the grant sits in the channel: the driver can take it (the worker
+    /// became resident) or abort the task and drop `ready` (the start was
+    /// cancelled, releasing the grant). The outcome is always reported, so the
+    /// driver never blocks waiting on a start that was rejected.
+    struct InFlight {
+        handle: JoinHandle<()>,
+        ready: tokio::sync::oneshot::Receiver<Option<MemoryGrant>>,
+    }
+
+    /// Drive one randomized workload (`ops`) against a fresh admission controller
+    /// whose probe reports `limit` bytes and zero usage, then assert that headroom
+    /// recovers to `limit` and a full-ceiling admission fits once every grant
+    /// guard is gone.
+    async fn run_workload(limit: u64, ops: Vec<Op>) -> Result<(), TestCaseError> {
+        let controller = Arc::new(AdmissionController::new(
+            Box::new(ZeroUsageProbe { limit }),
+            AdmissionPolicy { usable_ratio: 1.0 },
+        ));
+
+        let mut in_flight: Vec<InFlight> = Vec::new();
+        let mut resident: Vec<MemoryGrant> = Vec::new();
+
+        for op in ops {
+            match op {
+                Op::Start(bytes) => {
+                    let controller = controller.clone();
+                    let (tx, rx) = tokio::sync::oneshot::channel();
+                    let handle = tokio::spawn(async move {
+                        // Always report the admission outcome so the driver never
+                        // blocks on a rejected start. On admit the grant sits in
+                        // the channel until taken as resident or dropped on
+                        // cancel; on reject we report `None`.
+                        let outcome = controller.admit(bytes, &NoEvictionSource).await;
+                        let _ = tx.send(outcome);
+                        // Park so the task stays alive until the driver decides
+                        // its fate (become resident, or be aborted on cancel).
+                        std::future::pending::<()>().await;
+                    });
+                    in_flight.push(InFlight { handle, ready: rx });
+                }
+                Op::Resident(idx) => {
+                    if !in_flight.is_empty() {
+                        let i = idx.index(in_flight.len());
+                        let started = in_flight.remove(i);
+                        // Only an admitted start can become resident: keep its
+                        // grant (the worker is now running); a rejected start is
+                        // simply discarded. Either way abort the parked task.
+                        if let Ok(Some(grant)) = started.ready.await {
+                            resident.push(grant);
+                        }
+                        started.handle.abort();
+                        let _ = started.handle.await;
+                    }
+                }
+                Op::CancelStart(idx) => {
+                    if !in_flight.is_empty() {
+                        let i = idx.index(in_flight.len());
+                        let started = in_flight.remove(i);
+                        // Delete a waiting worker: abort the task and drop the
+                        // `InFlight`. Any grant the start acquired is held in
+                        // `started.ready`; dropping it returns the reservation,
+                        // exactly as aborting a waiting worker mid-flight does.
+                        started.handle.abort();
+                        let _ = started.handle.await;
+                        drop(started.ready);
+                    }
+                }
+                Op::Exit(idx) => {
+                    if !resident.is_empty() {
+                        let i = idx.index(resident.len());
+                        drop(resident.remove(i));
+                    }
+                }
+            }
+            // Let acquires/aborts settle so the granted accounting is observable.
+            for _ in 0..4 {
+                tokio::task::yield_now().await;
+            }
+        }
+
+        // Tear everything down: abort remaining starts, drop remaining resident
+        // grants. The environment is now empty.
+        for started in in_flight.drain(..) {
+            started.handle.abort();
+            let _ = started.handle.await;
+        }
+        resident.clear();
+        // Let the final drops' releases settle.
+        tokio::time::sleep(Duration::from_millis(20)).await;
+
+        let headroom = controller.headroom_bytes();
+        prop_assert_eq!(
+            headroom,
+            limit,
+            "headroom did not recover to ceiling {} after all grants were released (got {}); \
+             a grant leaked or was double-released across the lifecycle",
+            limit,
+            headroom
+        );
+
+        // And the gate must be live again: a fresh full-ceiling admission fits.
+        let readmit = controller.admit(limit, &NoEvictionSource).await;
+        prop_assert!(
+            readmit.is_some(),
+            "gate refused a full-ceiling admission after draining; headroom is wedged"
+        );
+        Ok(())
+    }
+
+    proptest! {
+        #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })]
+
+        /// Liveness: whatever the interleaving of start / become-resident /
+        /// cancel-start / exit, the gate's admissible headroom is back at the
+        /// full ceiling — and admits again — once every grant guard is gone. A
+        /// grant leaked on cancellation (or released twice) violates this.
+        #[test]
+        fn grants_never_leak_under_random_churn(
+            limit in 200u64..4000,
+            ops in arb_ops(),
+        ) {
+            let mut builder = tokio::runtime::Builder::new_multi_thread();
+            builder.worker_threads(4).enable_time();
+            let rt = builder.build().unwrap();
+
+            let budget = Duration::from_secs(10);
+            rt.block_on(async move {
+                match tokio::time::timeout(budget, run_workload(limit, ops)).await {
+                    Ok(result) => result,
+                    Err(_elapsed) => Err(TestCaseError::fail(
+                        "grant workload did not complete within the timeout",
+                    )),
+                }
+            })?;
+        }
+    }
+}
diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs
index 733d9529af..7e6ca1a298 100644
--- a/golem-worker-executor/src/services/golem_config.rs
+++ b/golem-worker-executor/src/services/golem_config.rs
@@ -73,6 +73,11 @@ pub struct GolemConfig {
     pub max_websocket_connections: usize,
     pub http_address: String,
     pub http_port: u16,
+    /// How often tokio runtime metrics are sampled from the runtime and pushed
+    /// into the metrics recorder exposed on `/metrics`. Prometheus scrapes the
+    /// rendered values independently; this is the in-process resolution.
+    #[serde(with = "humantime_serde")]
+    pub runtime_metrics_sampling_interval: Duration,
 }
 
 impl SafeDisplay for GolemConfig {
@@ -284,6 +289,7 @@ impl Default for GolemConfig {
             max_websocket_connections: 100,
             http_address: "0.0.0.0".to_string(),
             http_port: 8082,
+            runtime_metrics_sampling_interval: Duration::from_secs(5),
         }
     }
 }
@@ -963,28 +969,31 @@ pub struct MemoryConfig {
     pub system_memory_override: Option<u64>,
     pub worker_memory_ratio: f64,
     pub worker_estimate_coefficient: f64,
+    /// Multiplier applied to a component's `component_size` when reserving its
+    /// compiled-module memory with the admission gate, charged once per resident
+    /// component (shared across all its workers) rather than per worker.
+    pub component_size_coefficient: f64,
+    /// Whether the measured-headroom admission gate is active. Requires the
+    /// executor to own its memory environment (its own cgroup/process), as in a
+    /// production pod. Disable in shared environments — such as the in-process
+    /// test harness — where the probe cannot isolate this executor's footprint
+    /// from co-resident processes.
+    pub enable_measured_admission: bool,
     #[serde(with = "humantime_serde")]
     pub acquire_retry_delay: Duration,
     pub oom_retry_config: RetryConfig,
 }
 
 impl MemoryConfig {
-    pub fn total_system_memory(&self) -> u64 {
-        self.system_memory_override.unwrap_or_else(|| {
-            let mut sysinfo = sysinfo::System::new();
-            sysinfo.refresh_memory();
-            sysinfo.total_memory()
-        })
-    }
-
-    pub fn system_memory(&self) -> u64 {
-        let mut sysinfo = sysinfo::System::new();
-        sysinfo.refresh_memory();
-        sysinfo.available_memory()
-    }
-
-    pub fn worker_memory(&self) -> usize {
-        (self.total_system_memory() as f64 * self.worker_memory_ratio) as usize
+    /// The admission policy for the measured-headroom gate. Reuses
+    /// `worker_memory_ratio` as the usable fraction of the measured limit (the
+    /// host keeps the remainder).
+    pub(crate) fn admission_policy(
+        &self,
+    ) -> crate::services::active_workers::admission::AdmissionPolicy {
+        crate::services::active_workers::admission::AdmissionPolicy {
+            usable_ratio: self.worker_memory_ratio,
+        }
     }
 }
 
@@ -1004,6 +1013,16 @@ impl SafeDisplay for MemoryConfig {
             "worker estimate coefficient: {}",
             self.worker_estimate_coefficient
         );
+        let _ = writeln!(
+            &mut result,
+            "component size coefficient: {}",
+            self.component_size_coefficient
+        );
+        let _ = writeln!(
+            &mut result,
+            "measured admission enabled: {}",
+            self.enable_measured_admission
+        );
         let _ = writeln!(
             &mut result,
             "acquire retry delay: {:?}",
@@ -1528,6 +1547,8 @@ impl Default for MemoryConfig {
             system_memory_override: None,
             worker_memory_ratio: 0.8,
             worker_estimate_coefficient: 1.1,
+            component_size_coefficient: 2.0,
+            enable_measured_admission: true,
             acquire_retry_delay: Duration::from_millis(500),
             oom_retry_config: RetryConfig {
                 max_attempts: u32::MAX,
diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs
index 1e6d4fa7cc..23ba710f81 100644
--- a/golem-worker-executor/src/worker/mod.rs
+++ b/golem-worker-executor/src/worker/mod.rs
@@ -27,7 +27,8 @@ use crate::durable_host::recover_stderr_logs;
 use crate::metrics::storage::record_filesystem_pool_released;
 use crate::model::{AgentConfig, ExecutionStatus, LookupResult, ReadFileResult, TrapType};
 use crate::services::active_workers::{
-    FilesystemStoragePermit, RegisteredConcurrentAccount, WorkerMemoryPermit,
+    FilesystemStoragePermit, HeldComponentCharge, MemoryGrant, RegisteredConcurrentAccount,
+    WorkerComponentCharge,
 };
 use crate::services::events::{Event, EventsSubscription};
 use crate::services::golem_config::SnapshotPolicy;
@@ -58,6 +59,7 @@ use golem_common::model::agent::{
     AgentMode, ParsedAgentId, Principal, Snapshotting, SnapshottingConfig,
 };
 use golem_common::model::component::CanonicalFilePath;
+use golem_common::model::component::ComponentId;
 use golem_common::model::component::ComponentRevision;
 use golem_common::model::invocation_context::InvocationContextStack;
 use golem_common::model::oplog::{OplogEntry, OplogIndex, UpdateDescription};
@@ -410,6 +412,12 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             WorkerInstance::Unloaded { .. } => {
                 this.mark_as_loading();
                 crate::metrics::workers::inc_worker_waiting_for_memory();
+                crate::metrics::wasm::record_worker_resident_linear_memory(
+                    this.get_latest_worker_metadata()
+                        .await
+                        .last_known_status
+                        .total_linear_memory_size,
+                );
                 *instance_guard = WorkerInstance::WaitingForPermit(WaitingWorker::new(
                     this.clone(),
                     this.memory_requirement().await?,
@@ -789,15 +797,29 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
         self.execution_status.read().unwrap().agent_mode()
     }
 
-    /// Gets the estimated memory requirement of the worker
+    /// Gets the estimated memory requirement of the worker.
+    ///
+    /// This covers only the per-worker linear memory. The compiled component
+    /// module is shared by all workers of a component and is charged once per
+    /// resident component via the component-charge registry, not per worker.
     pub async fn memory_requirement(&self) -> Result<u64, WorkerExecutorError> {
         let metadata = self.get_latest_worker_metadata().await;
 
-        let ml = metadata.last_known_status.total_linear_memory_size as f64;
-        let sw = metadata.last_known_status.component_size as f64;
-        let c = 2.0;
-        let x = self.worker_estimate_coefficient;
-        Ok((x * (ml + c * sw)) as u64)
+        let linear_memory_bytes = metadata.last_known_status.total_linear_memory_size as f64;
+        let estimate_coefficient = self.worker_estimate_coefficient;
+        Ok((estimate_coefficient * linear_memory_bytes) as u64)
+    }
+
+    /// Returns the component identity and compiled-module size used to charge
+    /// the shared module memory once per resident component.
+    pub async fn component_charge_requirement(
+        &self,
+    ) -> Result<(ComponentId, ComponentRevision, u64), WorkerExecutorError> {
+        let metadata = self.get_latest_worker_metadata().await;
+        let component_id = self.owned_agent_id.component_id();
+        let component_revision = metadata.last_known_status.component_revision;
+        let component_module_bytes = metadata.last_known_status.component_size;
+        Ok((component_id, component_revision, component_module_bytes))
     }
 
     /// Gets the storage requirement of the worker based on the last known status.
@@ -963,20 +985,39 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
 
     // Should only be called from invocation loop
     pub async fn increase_memory(&self, delta: u64) -> anyhow::Result<()> {
+        // The instance lock must not be held while running the admission gate:
+        // it may run the eviction scan, which takes other workers' instance
+        // locks. Holding this worker's instance lock across that scan while
+        // another growing worker does the same is an AB-BA deadlock. So check the
+        // state, release the lock, then run the gate.
+        match &*self.instance.lock().await {
+            WorkerInstance::Running(_) => {}
+            WorkerInstance::Stopping(_)
+            | WorkerInstance::WaitingForPermit(_)
+            | WorkerInstance::Unloaded { .. }
+            | WorkerInstance::Deleting => return Ok(()),
+        }
+
+        let Some(extra_grant) = self.active_workers().try_acquire(delta).await else {
+            crate::metrics::workers::record_worker_memory_grow_rejected();
+            return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory));
+        };
+
+        // Re-check state under the lock: the worker may have changed state while
+        // the gate ran. If it is still running, merge the extra grant into the
+        // running worker so its whole reservation releases together on unload.
+        // Otherwise drop `extra_grant` here, returning the reservation to the
+        // gate, and treat the grow as a no-op (matching the non-running arms).
         match &mut *self.instance.lock().await {
             WorkerInstance::Running(running) => {
-                if let Some(new_permits) = self.active_workers().try_acquire(delta).await {
-                    running.merge_extra_permits(new_permits);
-                    Ok(())
-                } else {
-                    Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory))
-                }
+                running.merge_extra_memory_grant(extra_grant);
             }
-            WorkerInstance::Stopping(_) => Ok(()),
-            WorkerInstance::WaitingForPermit(_) => Ok(()),
-            WorkerInstance::Unloaded { .. } => Ok(()),
-            WorkerInstance::Deleting => Ok(()),
+            WorkerInstance::Stopping(_)
+            | WorkerInstance::WaitingForPermit(_)
+            | WorkerInstance::Unloaded { .. }
+            | WorkerInstance::Deleting => {}
         }
+        Ok(())
     }
 
     /// Return `freed_bytes` to the storage semaphore pool.
@@ -1627,11 +1668,15 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                 // when stopping via the invocation loop we can stop immediately, no need to go via the stopping status
                 if called_from_invocation_loop {
                     crate::metrics::workers::dec_worker_memory_resident();
+                    // Dropping `running` at the end of this arm releases its
+                    // memory grant (and component/storage permits) back to the
+                    // gate.
                     **instance_guard = final_state.into_instance();
                     StopResult::Stopped
                 } else {
                     // drop the running worker, this signals to the invocation loop to start exiting.
-                    // RunningWorker::drop releases the memory permit, so dec resident here.
+                    // `stop()` consumes the RunningWorker and drops everything but
+                    // its join handle, releasing its memory grant back to the gate.
                     let run_loop_handle = running.stop();
                     let notify = OneShotEvent::new();
                     crate::metrics::workers::dec_worker_memory_resident();
@@ -2183,7 +2228,8 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
 
     async fn start_waiting_worker(
         this: Arc<Worker<Ctx>>,
-        permit: WorkerMemoryPermit,
+        memory_grant: MemoryGrant,
+        component_charge: WorkerComponentCharge,
         filesystem_storage_permit: Option<FilesystemStoragePermit>,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
         oom_retry_count: u32,
@@ -2198,7 +2244,8 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
                     this.owned_agent_id.clone(),
                     this.queue.clone(),
                     this.clone(),
-                    permit,
+                    memory_grant,
+                    component_charge,
                     concurrent_agent_permit,
                     oom_retry_count,
                 )
@@ -2212,6 +2259,8 @@ impl<Ctx: WorkerCtx> Worker<Ctx> {
             }
             _ => {
                 debug!("worker was not waiting for permit anymore, not starting");
+                // The worker is not becoming resident: dropping `memory_grant`
+                // here returns its reservation to the gate.
             }
         }
     }
@@ -2349,10 +2398,41 @@ impl WaitingWorker {
                 let agent_id = parent.owned_agent_id.agent_id();
                 let registered_concurrent_account = parent.registered_concurrent_account.clone();
                 let concurrent_agent_permit = registered_concurrent_account.acquire(agent_id).await;
-                // Do not reserve executor memory while waiting for a per-account
-                // concurrency slot. Otherwise one account could fill the memory
-                // pool with workers that are not allowed to run yet.
-                let permit = parent.active_workers().acquire(memory_requirement).await;
+                // Reserve executor memory only after the per-account concurrency
+                // slot is held. Otherwise one account could exhaust the memory
+                // headroom with workers that are not allowed to run yet.
+                //
+                // `memory_grant` owns the reservation from here on: it is held as
+                // a local until the worker becomes resident (when it moves into
+                // the RunningWorker) or this task ends/aborts (when dropping it
+                // returns the reservation to the gate). This is what makes a
+                // start cancelled mid-flight — e.g. the worker being deleted while
+                // still waiting for its remaining permits — release rather than
+                // leak its grant.
+                let memory_grant = parent.active_workers().acquire(memory_requirement).await;
+                // Reserve the component's compiled module size once per resident
+                // component (shared by all its workers). Held for as long as this
+                // worker is resident; the module faults into RAM when the first
+                // worker loads, so reserving it keeps later admissions honest.
+                let component_charge = match parent.component_charge_requirement().await {
+                    Ok((component_id, component_revision, component_module_bytes)) => {
+                        parent
+                            .active_workers()
+                            .acquire_component_charge(
+                                component_id,
+                                component_revision,
+                                component_module_bytes,
+                            )
+                            .await
+                    }
+                    Err(err) => {
+                        warn!(
+                            "Failed to determine component charge requirement, not starting: {err}"
+                        );
+                        // Dropping `memory_grant` here returns its reservation.
+                        return;
+                    }
+                };
                 // Pre-acquire storage permits for this restart.
                 //
                 // We need to acquire `filesystem_storage_requirement + desired_extra` total:
@@ -2403,7 +2483,8 @@ impl WaitingWorker {
                 debug!("Attempting to start worker after acquiring enough permits");
                 Worker::start_waiting_worker(
                     parent,
-                    permit,
+                    memory_grant,
+                    component_charge,
                     filesystem_storage_permit,
                     concurrent_agent_permit,
                     oom_retry_count,
@@ -2435,7 +2516,18 @@ struct RunningWorker {
     handle: Option<JoinHandle<()>>,
     sender: UnboundedSender<WorkerCommand>,
     queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
-    permit: WorkerMemoryPermit,
+    /// The worker's memory reservation with the admission gate, covering its
+    /// initial requirement plus any grow deltas merged in. Held only to be
+    /// dropped: dropping it (on stop, eviction, or this worker being dropped for
+    /// any reason) returns the reservation to the gate, keeping the granted total
+    /// symmetric with what was reserved.
+    #[allow(dead_code)]
+    memory_grant: MemoryGrant,
+    /// Keeps this worker's component module charge alive while it is resident.
+    /// Held only to be dropped: dropping it releases the component's residency
+    /// (and the module reservation if this was the last worker of the component).
+    #[allow(dead_code)]
+    component_charge: Box<dyn HeldComponentCharge>,
     /// Storage semaphore permits held by this worker. `None` until storage
     /// space is first acquired (at startup or on first write). Dropped
     /// automatically when `RunningWorker` is dropped, returning storage
@@ -2466,7 +2558,8 @@ impl RunningWorker {
         owned_agent_id: OwnedAgentId,
         queue: Arc<RwLock<VecDeque<QueuedWorkerInvocation>>>,
         parent: Arc<Worker<Ctx>>,
-        permit: WorkerMemoryPermit,
+        memory_grant: MemoryGrant,
+        component_charge: WorkerComponentCharge,
         concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit,
         oom_retry_count: u32,
     ) -> Self {
@@ -2516,7 +2609,8 @@ impl RunningWorker {
             handle: Some(handle),
             sender,
             queue,
-            permit,
+            memory_grant,
+            component_charge: Box::new(component_charge),
             filesystem_storage_permit: None,
             waiting_for_command,
             interrupt_signal,
@@ -2524,8 +2618,11 @@ impl RunningWorker {
         }
     }
 
-    pub fn merge_extra_permits(&mut self, extra_permit: WorkerMemoryPermit) {
-        self.permit.merge(extra_permit);
+    /// Merge an additional memory grant (from a successful grow) into this
+    /// worker's grant, so its whole reservation is released together when the
+    /// worker unloads.
+    pub fn merge_extra_memory_grant(&mut self, extra: MemoryGrant) {
+        self.memory_grant.merge(extra);
     }
 
     /// Merge additional storage permits into this worker's storage permit. If
diff --git a/golem-worker-executor/tests/resource_limits.rs b/golem-worker-executor/tests/resource_limits.rs
index 58377cba3b..a816beb39e 100644
--- a/golem-worker-executor/tests/resource_limits.rs
+++ b/golem-worker-executor/tests/resource_limits.rs
@@ -186,11 +186,14 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish(
     let context = TestContext::new(last_unique_id);
     let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?;
 
-    // HTTP server that gates its /poll response behind a Notify.
+    // HTTP server that gates its /poll response behind a zero-permit semaphore.
     // HttpClient2.start_polling polls GET /poll until the body equals "done".
-    // By holding the Notify unreleased we keep a1 in the Running state
-    // for as long as needed, preventing eviction and holding the only permit.
-    let gate = std::sync::Arc::new(tokio::sync::Notify::new());
+    // The handler blocks acquiring a permit, so by withholding the permit we keep
+    // a1 in the Running state for as long as needed, preventing eviction and
+    // holding the only permit. A semaphore is used rather than a Notify so the
+    // release is not sensitive to whether the request's waiter is registered
+    // before the release call.
+    let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0));
     let gate_clone = gate.clone();
     let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?;
     let port = listener.local_addr()?.port();
@@ -200,7 +203,10 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish(
             get(move || {
                 let gate = gate_clone.clone();
                 async move {
-                    gate.notified().await;
+                    gate.acquire()
+                        .await
+                        .expect("gate semaphore closed")
+                        .forget();
                     "done".to_string()
                 }
             }),
@@ -259,7 +265,7 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish(
     // Release the gate — a1's poll loop returns "done", its invocation
     // completes, and its permit is returned to the semaphore via Drop.
     // This unblocks a2 from WaitingForPermit.
-    gate.notify_waiters();
+    gate.add_permits(1);
 
     // Wait for a1 to become Idle (invocation done, permit released).
     executor
@@ -320,7 +326,13 @@ async fn concurrent_agent_idle_releases_permit(
     let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?;
 
     // --- HTTP gate: keeps a1 provably Running until we release it. ---
-    let gate = std::sync::Arc::new(tokio::sync::Notify::new());
+    // A zero-permit semaphore is used rather than a Notify so the release is not
+    // sensitive to whether the request's waiter is registered before the release
+    // call: a permit added before the handler reaches `acquire` is simply waiting
+    // for it. The handler blocks on `acquire` and only returns once the test adds
+    // a permit, so a1 stays Running (blocked in /poll) until then regardless of
+    // how the runner schedules the tasks.
+    let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0));
     let gate_clone = gate.clone();
     let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?;
     let port = listener.local_addr()?.port();
@@ -330,7 +342,12 @@ async fn concurrent_agent_idle_releases_permit(
             get(move || {
                 let gate = gate_clone.clone();
                 async move {
-                    gate.notified().await;
+                    // `forget` keeps the permit from being returned on drop,
+                    // so a single added permit releases exactly one poll.
+                    gate.acquire()
+                        .await
+                        .expect("gate semaphore closed")
+                        .forget();
                     "done".to_string()
                 }
             }),
@@ -387,7 +404,7 @@ async fn concurrent_agent_idle_releases_permit(
     // Release the gate. a1's poll returns "done", invocation completes, a1 goes Idle.
     // With the fix: Idle transition drops the permit → semaphore notifies a2 → a2 starts.
     // With the bug: a1 stays Idle but holds permit → a2 remains blocked forever.
-    gate.notify_waiters();
+    gate.add_permits(1);
 
     // a2 should now be unblocked (fix) or remain stuck (bug).
     // Give it 15 seconds — well beyond what starting a counter agent takes.
diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
new file mode 100644
index 0000000000..1d7a477661
--- /dev/null
+++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml
@@ -0,0 +1,68 @@
+# Cloud throughput-saturation benchmark suite.
+#
+# Unlike cloud-perf's throughput benchmarks (which keep `size` small enough that
+# all workers fit in memory), this suite deliberately ramps the number of
+# active, memory-holding agents up to and past the executor's memory ceiling to
+# find the per-pod active-agent capacity and the throughput sustained once
+# memory is exhausted.
+#
+# Each agent retains a deterministic, per-agent-distinct amount of resident
+# memory, so the fleet presents a mix of footprints near the limit (exercising
+# the admission/eviction path). The measured phase drives one in-flight
+# `busy_for` call per agent and records aggregate throughput.
+#
+# Run with the benchmarks binary's `cloud` subcommand (same flags as cloud-perf):
+#
+#   benchmarks suite integration-tests/benchmark_suites/cloud-density-saturation.yaml \
+#     --save-to-json result.json \
+#     cloud --api-url https://<host> --apps-base-domain <domain> \
+#       --admin-account-token <token> --builtin-plugin-owner-account-id <uuid> \
+#       --default-plan-id <uuid> --component-directory <path-to-wasm-components>
+#
+# Reading the result: plot `saturation-throughput-ops-per-sec` and
+# invocation-retries/timeouts against `size`. Throughput climbs with `size`
+# until the pod's memory is exhausted, then plateaus or drops while retries and
+# eviction churn rise — that knee is the active-agent ceiling.
+#
+# `clusterSize` is ignored in cloud mode (single observed cluster).
+
+name: cloud-density-saturation
+benchmarks:
+  # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is
+  # # charged once per component, shared across all agents; what scales per agent
+  # # is the small instance heap). The previous run reached the top of the sweep
+  # # (12000) without saturating pod memory, so the knee here is throughput /
+  # # eviction-churn rather than memory. Dropped the low points that told us
+  # # nothing and pushed the range up with coarser steps.
+  # - name: throughput-saturation-echo-rust
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [2000, 3000, 4000, 5000, 10000, 15000, 20000]
+  #   length: [0]
+
+  # # TypeScript echo agents — each instance instantiates its own QuickJS runtime
+  # # and JS heap in its own linear memory (the 17.4 MB module is shared once per
+  # # component; the per-instance runtime state is the heavy per-agent cost).
+  # # Heavier per agent than the Rust variant, so a lower knee — but the previous
+  # # run reached 2000 without saturating, so push higher and drop the low points.
+  # - name: throughput-saturation-echo-ts
+  #   iterations: 3
+  #   clusterSize: [2]
+  #   size: [1000, 2000, 3000]
+  #   length: [0]
+
+  # Synthetic footprint — each agent retains a deterministic per-agent-distinct
+  # amount of resident memory, exercising the admission/eviction path with a
+  # controllable footprint near the limit. Run first: this is the variant that
+  # actually fills memory and drives the gate to its reject/evict path.
+  # size   = number of active, memory-holding agents (the ramp axis)
+  # length = base per-agent memory footprint in bytes; each agent retains a
+  #          deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base =>
+  #          ~72 MiB average per agent, filling a ~10 GiB usable pool at
+  #          ~142 agents. The sweep brackets that ceiling and pushes well past it
+  #          so the admission gate's reject/evict behaviour near OOM is exercised.
+  - name: throughput-saturation-counters
+    iterations: 1
+    clusterSize: [2]
+    size: [50, 100, 150, 200, 300]
+    length: [16777216]
diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml
new file mode 100644
index 0000000000..ef8dd7d61f
--- /dev/null
+++ b/integration-tests/benchmark_suites/cloud-perf.yaml
@@ -0,0 +1,130 @@
+# Cloud-perf benchmark suite — runs the full benchmark suite against a
+# deployed Golem environment via Gateway-API hostnames (TestMode::Cloud).
+#
+# Run with the benchmarks binary's `cloud` subcommand:
+#
+#   benchmarks suite integration-tests/benchmark_suites/cloud-perf.yaml \
+#     --save-to-json result.json \
+#     cloud \
+#       --api-url https://<your-golem-api-host> \
+#       --apps-base-domain <your-apps-base-domain> \
+#       --admin-account-token <token> \
+#       --builtin-plugin-owner-account-id <uuid> \
+#       --default-plan-id <uuid> \
+#       --component-directory <path-to-wasm-components>
+#
+# Note: clusterSize is ignored in Cloud mode (the observed cluster size is
+# read from shard-manager at run start and recorded in result metadata).
+#
+# Suite order rationale: throughput benchmarks run first because they involve
+# RPC worker pairs and HTTP deployments — the most complex setup. Running them
+# early surfaces infrastructure issues (stuck workers, port-forward drops)
+# before spending time on the simpler benchmarks.
+
+name: cloud-perf
+benchmarks:
+  # Throughput — measures invocation throughput across six implementations:
+  # rust agent (gRPC), TS agent (gRPC), rust agent (HTTP), TS agent (HTTP),
+  # TS RPC pair, rust RPC pair.
+  # size   = number of workers per implementation (×6 implementations total)
+  # length = unused for echo
+  - name: throughput-echo
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 50, 100, 250]
+    length: [1000]
+
+  # size   = number of workers per implementation
+  # length = payload size in bytes sent to large_input
+  # NOTE: large payloads grow worker linear memory, so this is the throughput
+  # benchmark most relevant to the memory-admission investigation — sized to
+  # match throughput-echo so it exercises real density.
+  - name: throughput-large-input
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 50, 100, 250]
+    length: [100, 10000]
+
+  # size   = number of workers per implementation
+  # length = CPU work length passed to cpu_intensive
+  - name: throughput-cpu-intensive
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 50, 100, 250]
+    length: [100]
+
+  # Cold-start: compilation cache disabled — measures true cold-start latency
+  # with no warm compiled artefact available.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [2]
+    disableCompilationCache: true
+
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [5]
+    disableCompilationCache: true
+
+  # Cold-start: compilation cache enabled — measures latency once the compiled
+  # artefact is available in the cache.
+  # size   = number of unique components created (each in its own env)
+  # length = seconds to wait per component for pre-compilation warm-up
+  # NOTE: if results here are close to the cache-disabled entries above, the
+  # warm-up wait is too short and compilation hasn't finished — bump length.
+  - name: cold-start-unknown-small
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [2]
+
+  - name: cold-start-unknown-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [1, 5, 25, 50]
+    length: [5]
+
+  # Invocation latency — hot and cold paths through the Gateway NLB.
+  # Large worker counts to stress the load balancer and connection pool.
+  # size   = number of workers created
+  # length = number of hot invocations per worker after the first cold one
+  - name: latency-small
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000, 5000]
+    length: [2]
+
+  - name: latency-medium
+    iterations: 3
+    clusterSize: [2]
+    size: [100, 500, 1000, 2000]
+    length: [5]
+
+  # Sleep — measures worker suspension and resumption under real network
+  # conditions. High residency: all `size` workers held in memory sleeping at
+  # once, so this also probes how many resident workers fit (memory-admission
+  # relevant) — sized up to the ~2000 workers that the echo runs proved out.
+  # size   = number of workers launched in parallel
+  # length = sleep duration in milliseconds
+  - name: sleep
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 100, 500, 1000, 2000]
+    length: [10000]
+
+  # Durability overhead — measures the cost of durable vs ephemeral execution
+  # across four variants (durable-persistent, durable-non-persistent,
+  # ephemeral, durable-persistent-commit). size workers concurrent per phase;
+  # sized up to put real load on the oplog/persistence/storage path.
+  # size   = number of workers per variant
+  # length = loop iteration count passed to oplog_heavy
+  - name: durability-overhead
+    iterations: 3
+    clusterSize: [2]
+    size: [10, 50, 100, 200]
+    length: [5000]
diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs
index 91d972534d..e79ac78612 100644
--- a/integration-tests/src/benchmarks/all.rs
+++ b/integration-tests/src/benchmarks/all.rs
@@ -13,16 +13,28 @@
 // limitations under the License.
 
 use clap::Parser;
+use golem_client::api::RegistryServiceClient;
+use golem_common::base_model::agent::ParsedAgentId;
+use golem_common::model::AgentId;
+use golem_common::model::application::{ApplicationCreation, ApplicationName};
+use golem_common::model::environment::{EnvironmentCreation, EnvironmentName};
+use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{
     Benchmark, BenchmarkApi, BenchmarkConfig, BenchmarkResult, BenchmarkSuite, BenchmarkSuiteItem,
-    BenchmarkSuiteResult,
+    BenchmarkSuiteResult, RunMetadata,
+};
+use golem_test_framework::config::benchmark::{TestMode, cloud_bench_run_id};
+use golem_test_framework::config::{
+    BenchmarkCliParameters, BenchmarkTestDependencies, TestDependencies,
+};
+use golem_test_framework::dsl::{TestDsl, TestDslExtended};
+use integration_tests::benchmarks::{
+    cleanup_account, cleanup_user_state, delete_workers, invoke_and_await_agent,
 };
-use golem_test_framework::config::benchmark::TestMode;
-use golem_test_framework::config::{BenchmarkCliParameters, BenchmarkTestDependencies};
 use std::collections::BTreeMap;
 use std::future::Future;
 use std::pin::Pin;
-use tracing::{Level, debug, info};
+use tracing::{Level, debug, info, warn};
 
 type RunFn = Box<
     dyn for<'a> Fn(
@@ -121,6 +133,30 @@ async fn main() {
             >(mode, verbosity, item, primary_only, otlp))
         }),
     );
+    benchmarks_by_name.insert(
+        "throughput-saturation-counters",
+        Box::new(|mode, verbosity, item, primary_only, otlp| {
+            Box::pin(run_benchmark::<
+                integration_tests::benchmarks::throughput_saturation::ThroughputSaturationCounters,
+            >(mode, verbosity, item, primary_only, otlp))
+        }),
+    );
+    benchmarks_by_name.insert(
+        "throughput-saturation-echo-rust",
+        Box::new(|mode, verbosity, item, primary_only, otlp| {
+            Box::pin(run_benchmark::<
+                integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoRust,
+            >(mode, verbosity, item, primary_only, otlp))
+        }),
+    );
+    benchmarks_by_name.insert(
+        "throughput-saturation-echo-ts",
+        Box::new(|mode, verbosity, item, primary_only, otlp| {
+            Box::pin(run_benchmark::<
+                integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoTs,
+            >(mode, verbosity, item, primary_only, otlp))
+        }),
+    );
 
     let params = BenchmarkCliParameters::parse_from(std::env::args_os());
     let tracer_provider = BenchmarkTestDependencies::init_logging(&params);
@@ -144,7 +180,14 @@ async fn main() {
                     length: length.clone(),
                     disable_compilation_cache: Some(*disable_compilation_cache),
                 };
-                let result = f(
+
+                cloud_preflight_warmup(
+                    params.benchmark_config.mode(),
+                    params.service_verbosity(),
+                    params.otlp,
+                )
+                .await;
+                let mut result = f(
                     params.benchmark_config.mode(),
                     params.service_verbosity(),
                     &item,
@@ -152,6 +195,10 @@ async fn main() {
                     params.otlp,
                 )
                 .await;
+                // Attach the run_id to result metadata (cloud mode only).
+                if let Some(run_id) = cloud_bench_run_id() {
+                    result.run_id = Some(format!("bench-{run_id}"));
+                }
                 if params.json {
                     let str = serde_json::to_string(&result)
                         .expect("Failed to serialize BenchmarkResult");
@@ -174,9 +221,27 @@ async fn main() {
             let suite: BenchmarkSuite =
                 serde_yaml::from_str(&raw_suite).expect("Failed to parse benchmark suite");
 
+            // Validate every benchmark name up-front so a typo exits immediately
+            // without running warmup or any prior benchmark.
+            for benchmark in &suite.benchmarks {
+                if !benchmarks_by_name.contains_key(benchmark.name.as_str()) {
+                    print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name);
+                    // print_non_existing_benchmark calls std::process::exit(1)
+                    unreachable!();
+                }
+            }
+
+            // Pre-flight warmup runs after all names are validated.
+            cloud_preflight_warmup(
+                params.benchmark_config.mode(),
+                params.service_verbosity(),
+                params.otlp,
+            )
+            .await;
+
             let mut suite_result = BenchmarkSuiteResult::new(&suite.name);
             for benchmark in suite.benchmarks {
-                info!("Running {benchmark:?}"); // TODO
+                info!("Running {benchmark:?}");
 
                 if let Some(f) = benchmarks_by_name.get(benchmark.name.as_str()) {
                     let result = f(
@@ -188,8 +253,19 @@ async fn main() {
                     )
                     .await;
                     suite_result.add(result);
-                } else {
-                    print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name);
+                }
+                // no else: we already validated all names above
+            }
+
+            // Attach the run_id and run metadata to the suite result (cloud mode only).
+            if let Some(run_id) = cloud_bench_run_id() {
+                suite_result.run_id = Some(format!("bench-{run_id}"));
+
+                // Read GOLEM_BENCH_* env vars set by the buildspec before invoking
+                // the binary. Missing vars produce None rather than failing the run.
+                let metadata = RunMetadata::from_env();
+                if !metadata.is_empty() {
+                    suite_result.run_metadata = Some(metadata);
                 }
             }
 
@@ -241,3 +317,164 @@ async fn run_benchmark<B: Benchmark>(
 ) -> BenchmarkResult {
     B::run_benchmark(mode, verbosity, item, primary_only, otlp).await
 }
+
+// ── Pre-flight warmup constants ───────────────────────────────────────────────
+
+/// WASM file name (without `.wasm`) of the component used for warmup
+/// invocations.  Must be present in `--component-directory`.
+const WARMUP_COMPONENT_WASM: &str = "benchmark_agent_rust_release";
+/// Registry display name for the warmup component.
+const WARMUP_COMPONENT_NAME: &str = "benchmark:agent-rust";
+/// Agent type whose `echo` method is invoked during warmup.
+const WARMUP_AGENT_TYPE: &str = "RustBenchmarkAgent";
+/// Instance ID of the throwaway warmup agent.
+const WARMUP_AGENT_INSTANCE: &str = "warmup";
+/// Total wall-clock budget for the 50 warmup invocations.  If the budget is
+/// exhausted (e.g. the platform is slow to cold-start on the first invocation),
+/// a warning is logged and the benchmark continues — warmup is best-effort.
+const WARMUP_BUDGET: std::time::Duration = std::time::Duration::from_secs(180);
+
+/// Pre-flight warmup for cloud mode. Runs once at suite/benchmark start;
+/// is a no-op for all non-cloud modes.
+///
+/// Executes 50 throwaway `invoke_and_await_agent` calls against a short-lived
+/// user/env/component. Each call exercises the full stack:
+/// gateway → registry-service (component lookup) → worker-service
+/// → worker-executor, warming NLB target-group routing and HTTP/2 sessions at
+/// every hop so they don't contaminate the first measured iteration.
+///
+/// The entire invocation phase is bounded by a 3-minute timeout. If the
+/// timeout fires (e.g. because of a gateway routing issue on the first cold
+/// start), a warning is logged and the benchmark continues — warm-up is
+/// best-effort.
+///
+/// If uploading the warmup component fails (e.g. the file is absent from the
+/// component directory), a warning is logged and the agent-invocation phase
+/// is skipped; the throwaway account is still cleaned up.
+async fn cloud_preflight_warmup(mode: &TestMode, verbosity: Level, otlp: bool) {
+    if !matches!(mode, TestMode::Cloud { .. }) {
+        return;
+    }
+
+    info!("Pre-flight warmup: creating throwaway user/env/component (50 invocations)...");
+
+    let deps = BenchmarkTestDependencies::new(mode, verbosity, 0, false, otlp).await;
+
+    let user = match deps.user().await {
+        Ok(u) => u,
+        Err(e) => {
+            warn!("Pre-flight warmup: failed to create user (skipping): {e:?}");
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let registry_client = user.registry_service_client().await;
+    let prefix = user.deps.bench_name_prefix().unwrap_or_default();
+
+    let app = match registry_client
+        .create_application(
+            &user.account_id.0,
+            &ApplicationCreation {
+                name: ApplicationName(format!("{prefix}app-warmup")),
+            },
+        )
+        .await
+    {
+        Ok(a) => a,
+        Err(e) => {
+            warn!("Pre-flight warmup: failed to create app (skipping): {e:?}");
+            cleanup_account(&user).await;
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let env = match registry_client
+        .create_environment(
+            &app.id.0,
+            &EnvironmentCreation {
+                name: EnvironmentName(format!("{prefix}env-warmup")),
+                compatibility_check: false,
+                version_check: false,
+                security_overrides: false,
+            },
+        )
+        .await
+    {
+        Ok(e) => e,
+        Err(e) => {
+            warn!("Pre-flight warmup: failed to create env (skipping): {e:?}");
+            // delete app explicitly before account (cascading delete is incomplete)
+            if let Err(del_err) = registry_client
+                .delete_application(&app.id.0, app.revision.into())
+                .await
+            {
+                warn!(
+                    "Pre-flight warmup: failed to delete app {} after env-creation \
+                     failure (best-effort, app may be orphaned): {del_err:?}",
+                    app.id.0
+                );
+            }
+            cleanup_account(&user).await;
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let component = match user
+        .component(&env.id, WARMUP_COMPONENT_WASM)
+        .name(WARMUP_COMPONENT_NAME)
+        .store()
+        .await
+    {
+        Ok(c) => c,
+        Err(e) => {
+            warn!(
+                "Pre-flight warmup: failed to upload warmup component \
+                 ({WARMUP_COMPONENT_WASM}.wasm) — ensure it exists in the \
+                 component directory: {e:?}"
+            );
+            cleanup_user_state(&user, &env.id).await;
+            deps.kill_all().await;
+            return;
+        }
+    };
+
+    let warmup_agent: ParsedAgentId = agent_id!(WARMUP_AGENT_TYPE, WARMUP_AGENT_INSTANCE);
+
+    // Bound the 50 invocations with a total wall-clock budget.
+    let invoke_result = tokio::time::timeout(WARMUP_BUDGET, async {
+        for i in 0..50usize {
+            let result = invoke_and_await_agent(
+                &user,
+                &component,
+                &warmup_agent,
+                "echo",
+                data_value!("warmup"),
+            )
+            .await;
+            info!(
+                "Pre-flight warmup invocation {}/50: {}ms",
+                i + 1,
+                result.accumulated_time.as_millis()
+            );
+        }
+    })
+    .await;
+
+    if invoke_result.is_err() {
+        warn!(
+            "Pre-flight warmup: invocation phase timed out after {}s (continuing anyway)",
+            WARMUP_BUDGET.as_secs()
+        );
+    }
+
+    if let Ok(worker_id) = AgentId::from_agent_id(component.id, &warmup_agent) {
+        delete_workers(&user, &[worker_id]).await;
+    }
+    cleanup_user_state(&user, &env.id).await;
+    deps.kill_all().await;
+
+    info!("Cloud pre-flight warmup complete.");
+}
diff --git a/integration-tests/src/benchmarks/cleanup.rs b/integration-tests/src/benchmarks/cleanup.rs
new file mode 100644
index 0000000000..2047b06c4d
--- /dev/null
+++ b/integration-tests/src/benchmarks/cleanup.rs
@@ -0,0 +1,529 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Cleanup helpers for cloud-perf benchmarks.
+//!
+//! The [`CleanupClient`] trait is the narrow interface used by the cascading
+//! cleanup logic, which enables unit-testing with the `MockCleanupClient`
+//! defined in this file's `tests` module.
+
+use async_trait::async_trait;
+use golem_client::api::RegistryServiceClient;
+use golem_common::model::environment::EnvironmentId;
+use golem_test_framework::config::dsl_impl::TestUserContext;
+use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies};
+use tracing::warn;
+use uuid::Uuid;
+
+// ── Narrow trait ─────────────────────────────────────────────────────────────
+
+/// Narrow client interface covering only the operations used by the cascading
+/// cleanup helpers. Use [`RegistryCleanupAdapter`] to wrap a real client and
+/// `tests::MockCleanupClient` (test builds only) to inject failures.
+#[async_trait]
+pub trait CleanupClient: Send + Sync {
+    /// Returns one `(component_id, revision)` pair per component in the env.
+    async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>>;
+    async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()>;
+
+    /// Returns the ID of each domain registration in the env.
+    async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result<Vec<Uuid>>;
+    async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()>;
+
+    /// Returns `(application_id, env_revision)` for the environment.
+    async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)>;
+    async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()>;
+
+    /// Returns the application's current revision.
+    async fn get_application_revision(&self, app_id: &Uuid) -> anyhow::Result<u64>;
+    async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()>;
+
+    /// Returns the account's current revision.
+    async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result<u64>;
+    async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()>;
+}
+
+// ── Real adapter ─────────────────────────────────────────────────────────────
+
+/// Adapter that exposes any `RegistryServiceClient` as a [`CleanupClient`].
+pub struct RegistryCleanupAdapter<C> {
+    inner: C,
+}
+
+impl<C: RegistryServiceClient + Send + Sync> RegistryCleanupAdapter<C> {
+    /// Wraps `inner`; every [`CleanupClient`] method call is forwarded to it.
+    pub fn new(inner: C) -> Self {
+        Self { inner }
+    }
+}
+
+#[async_trait]
+impl<C: RegistryServiceClient + Send + Sync> CleanupClient for RegistryCleanupAdapter<C> {
+    async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>> {
+        let page = self
+            .inner
+            .list_environment_components(env_id)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))?;
+        Ok(page
+            .values
+            .into_iter()
+            .map(|c| (c.id.0, c.revision.into()))
+            .collect())
+    }
+
+    async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        self.inner
+            .delete_component(id, revision)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))
+    }
+
+    async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result<Vec<Uuid>> {
+        let page = self
+            .inner
+            .list_environment_domain_registrations(env_id)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))?;
+        Ok(page.values.into_iter().map(|dr| dr.id.0).collect())
+    }
+
+    async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()> {
+        self.inner
+            .delete_domain_registration(id)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))
+    }
+
+    async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)> {
+        let env = self
+            .inner
+            .get_environment(env_id)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))?;
+        Ok((env.application_id.0, env.revision.into()))
+    }
+
+    async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        self.inner
+            .delete_environment(env_id, revision)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))
+    }
+
+    async fn get_application_revision(&self, app_id: &Uuid) -> anyhow::Result<u64> {
+        let app = self
+            .inner
+            .get_application(app_id)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))?;
+        Ok(app.revision.into())
+    }
+
+    async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        self.inner
+            .delete_application(app_id, revision)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))
+    }
+
+    async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result<u64> {
+        let account = self
+            .inner
+            .get_account(account_id)
+            .await
+            .map_err(|e| anyhow::anyhow!("{e:?}"))?;
+        Ok(account.revision.into())
+    }
+
+    async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()> {
+        self.inner
+            .delete_account(account_id, revision)
+            .await
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!("{e:?}"))
+    }
+}
+
+// ── Core cleanup logic (testable via CleanupClient) ───────────────────────────
+
+/// Runs steps 1–4 of the cascading cleanup for one environment, in order:
+/// components → domain registrations → environment → application. The
+/// account is **not** deleted here.
+///
+/// Every step is best-effort: a failure is logged with `warn!` and the later
+/// steps still run, except that step 4 needs the application ID from step 3's
+/// lookup and is skipped when that lookup fails.
+///
+/// **Note:** Server-side cascading delete is incomplete (golemcloud/golem#3291).
+pub async fn cleanup_env_and_app_with(client: &dyn CleanupClient, env_id: &Uuid) {
+    // Step 1: components
+    let components = match client.list_env_components(env_id).await {
+        Ok(listed) => listed,
+        Err(e) => {
+            warn!("cleanup: list components for env {env_id} failed (best-effort): {e:?}");
+            Vec::new()
+        }
+    };
+    for (cid, revision) in components {
+        if let Err(e) = client.delete_component(&cid, revision).await {
+            warn!("cleanup: delete component {cid} failed (best-effort): {e:?}");
+        }
+    }
+
+    // Step 2: domain registrations
+    let registration_ids = match client.list_env_domain_registrations(env_id).await {
+        Ok(listed) => listed,
+        Err(e) => {
+            warn!(
+                "cleanup: list domain registrations for env {env_id} failed \
+                 (best-effort): {e:?}"
+            );
+            Vec::new()
+        }
+    };
+    for id in registration_ids {
+        if let Err(e) = client.delete_domain_registration(&id).await {
+            warn!("cleanup: delete domain registration {id} failed (best-effort): {e:?}");
+        }
+    }
+
+    // Step 3: environment. The lookup also yields the environment's application
+    // ID; without it step 4 cannot run, so a failed lookup ends this function.
+    let (app_id, env_revision) = match client.get_env_app_id_and_revision(env_id).await {
+        Ok(found) => found,
+        Err(e) => {
+            warn!("cleanup: get environment {env_id} failed (best-effort): {e:?}");
+            return;
+        }
+    };
+    if let Err(e) = client.delete_environment(env_id, env_revision).await {
+        warn!("cleanup: delete environment {env_id} failed (best-effort): {e:?}");
+    }
+
+    // Step 4: application
+    match client.get_application_revision(&app_id).await {
+        Ok(app_revision) => {
+            if let Err(e) = client.delete_application(&app_id, app_revision).await {
+                warn!("cleanup: delete application {app_id} failed (best-effort): {e:?}");
+            }
+        }
+        Err(e) => warn!("cleanup: get application {app_id} failed (best-effort): {e:?}"),
+    }
+}
+
+/// Step 5 of the cascading cleanup: best-effort deletion of the user account.
+pub async fn cleanup_account_with(client: &dyn CleanupClient, account_id: &Uuid) {
+    let revision = match client.get_account_revision(account_id).await {
+        Ok(current) => current,
+        Err(e) => {
+            warn!("cleanup: get account {account_id} failed (best-effort): {e:?}");
+            return;
+        }
+    };
+    if let Err(e) = client.delete_account(account_id, revision).await {
+        warn!("cleanup: delete account {account_id} failed (best-effort): {e:?}");
+    }
+}
+
+// ── High-level wrappers (take a TestUserContext) ──────────────────────────────
+
+/// Runs steps 1–4 (components, domain registrations, environment,
+/// application) for `env_id`, using a registry client built from `user.token`.
+///
+/// For benchmarks whose iterations create one user with multiple envs/apps
+/// (e.g. cold-start-unknown), call this once per env then call
+/// [`cleanup_account`] once at the end.
+pub async fn cleanup_env_and_app(
+    user: &TestUserContext<BenchmarkTestDependencies>,
+    env_id: &EnvironmentId,
+) {
+    let registry = user.deps.registry_service().client(&user.token).await;
+    cleanup_env_and_app_with(&RegistryCleanupAdapter::new(registry), &env_id.0).await;
+}
+
+/// Step 5: deletes the account of `user`, using a registry client built from
+/// `user.token`.
+pub async fn cleanup_account(user: &TestUserContext<BenchmarkTestDependencies>) {
+    let registry = user.deps.registry_service().client(&user.token).await;
+    cleanup_account_with(&RegistryCleanupAdapter::new(registry), &user.account_id.0).await;
+}
+
+/// Convenience wrapper for the common single-env-per-user case: runs
+/// [`cleanup_env_and_app`] for `env_id`, then [`cleanup_account`] last.
+pub async fn cleanup_user_state(
+    user: &TestUserContext<BenchmarkTestDependencies>,
+    env_id: &EnvironmentId,
+) {
+    cleanup_env_and_app(user, env_id).await;
+    cleanup_account(user).await;
+}
+
+// ── Unit tests ────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+pub mod tests {
+    use super::*;
+    use std::collections::HashSet;
+    use std::sync::{Arc, Mutex};
+    use test_r::test;
+
+    /// Drives `f` to completion on a freshly built current-thread Tokio runtime.
+    fn block_on<F: std::future::Future>(f: F) -> F::Output {
+        use tokio::runtime::Builder;
+
+        let runtime = Builder::new_current_thread().enable_all().build().unwrap();
+        runtime.block_on(f)
+    }
+
+    /// In-process mock of [`CleanupClient`] that records every operation
+    /// attempted and fails the operations listed in `fail_ops`.
+    ///
+    /// Operations are identified by the name of the trait method. Successful
+    /// list calls return exactly one item, so each delete runs once per pass.
+    /// No method awaits anything; every result is produced immediately.
+    pub struct MockCleanupClient {
+        /// Names of the operations that must return a simulated error.
+        fail_ops: HashSet<&'static str>,
+        /// Ordered log of every operation attempted.
+        pub calls: Arc<Mutex<Vec<&'static str>>>,
+        /// The `application_id` returned by `get_env_app_id_and_revision`
+        /// (used to verify step-4 precondition propagation in tests).
+        pub app_id: Uuid,
+    }
+
+    impl MockCleanupClient {
+        /// Builds a mock that fails exactly the operations named in `fail_ops`.
+        ///
+        /// Also returns a second handle to the mock's call log, so tests can
+        /// inspect it after the run.
+        pub fn new(fail_ops: &[&'static str]) -> (Self, Arc<Mutex<Vec<&'static str>>>) {
+            let calls = Arc::new(Mutex::new(Vec::new()));
+            let mock = Self {
+                fail_ops: fail_ops.iter().copied().collect(),
+                calls: calls.clone(),
+                app_id: Uuid::new_v4(),
+            };
+            (mock, calls)
+        }
+
+        /// Appends `name` to the call log.
+        fn record(&self, name: &'static str) {
+            self.calls.lock().unwrap().push(name);
+        }
+
+        /// Single place where every mocked operation is logged and its outcome
+        /// decided: records `name`, then returns a simulated error if `name` is
+        /// in `fail_ops`, or `Ok(value)` otherwise.
+        fn respond<T>(&self, name: &'static str, value: T) -> anyhow::Result<T> {
+            self.record(name);
+            // `value` is built by the caller even on the failure path; that is
+            // harmless here because the mock's payloads are trivial.
+            if self.fail_ops.contains(name) {
+                Err(anyhow::anyhow!("simulated failure in {name}"))
+            } else {
+                Ok(value)
+            }
+        }
+
+        /// [`Self::respond`] for operations that return no payload.
+        fn result(&self, name: &'static str) -> anyhow::Result<()> {
+            self.respond(name, ())
+        }
+    }
+
+    #[async_trait]
+    impl CleanupClient for MockCleanupClient {
+        // Step 1: components (one fake component at revision 0).
+        async fn list_env_components(&self, _: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>> {
+            self.respond("list_env_components", vec![(Uuid::new_v4(), 0)])
+        }
+
+        async fn delete_component(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_component")
+        }
+
+        // Step 2: domain registrations (one fake registration).
+        async fn list_env_domain_registrations(&self, _: &Uuid) -> anyhow::Result<Vec<Uuid>> {
+            self.respond("list_env_domain_registrations", vec![Uuid::new_v4()])
+        }
+
+        async fn delete_domain_registration(&self, _: &Uuid) -> anyhow::Result<()> {
+            self.result("delete_domain_registration")
+        }
+
+        // Step 3: environment (reports `self.app_id` as its application, revision 1).
+        async fn get_env_app_id_and_revision(&self, _: &Uuid) -> anyhow::Result<(Uuid, u64)> {
+            self.respond("get_env_app_id_and_revision", (self.app_id, 1))
+        }
+
+        async fn delete_environment(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_environment")
+        }
+
+        // Step 4: application (always at revision 1).
+        async fn get_application_revision(&self, _: &Uuid) -> anyhow::Result<u64> {
+            self.respond("get_application_revision", 1)
+        }
+
+        async fn delete_application(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_application")
+        }
+
+        // Step 5: account (always at revision 1).
+        async fn get_account_revision(&self, _: &Uuid) -> anyhow::Result<u64> {
+            self.respond("get_account_revision", 1)
+        }
+
+        async fn delete_account(&self, _: &Uuid, _: u64) -> anyhow::Result<()> {
+            self.result("delete_account")
+        }
+    }
+
+    // ── Test helpers ──────────────────────────────────────────────────────────
+
+    fn all_ops() -> Vec<&'static str> {
+        Vec::from([
+            "list_env_components",
+            "delete_component",
+            "list_env_domain_registrations",
+            "delete_domain_registration",
+            "get_env_app_id_and_revision",
+            "delete_environment",
+            "get_application_revision",
+            "delete_application",
+            "get_account_revision",
+            "delete_account",
+        ])
+    }
+
+    /// Runs the full cascade (steps 1–4, then step 5) against `mock` with random IDs.
+    fn run(mock: &MockCleanupClient) {
+        let (env_id, account_id) = (Uuid::new_v4(), Uuid::new_v4());
+        block_on(async {
+            cleanup_env_and_app_with(mock, &env_id).await;
+            cleanup_account_with(mock, &account_id).await;
+        });
+    }
+
+    fn contains(calls: &[&str], op: &str) -> bool {
+        calls.contains(&op) // membership only; call order is not checked
+    }
+
+    // ── Tests ─────────────────────────────────────────────────────────────────
+
+    /// With no injected failures the whole cascade runs: every operation is
+    /// logged exactly once, in step order.
+    #[test]
+    fn all_steps_run_on_success() {
+        let (mock, calls) = MockCleanupClient::new(&[]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        // The mock lists one component and one domain registration, so a clean
+        // run makes exactly the calls in `all_ops()`, in that order. Comparing
+        // whole sequences also catches repeated or reordered operations.
+        assert_eq!(calls, all_ops(), "unexpected call sequence");
+    }
+
+    /// A failed component listing (step 1) must not prevent the domain
+    /// registration, environment and account steps from being attempted.
+    #[test]
+    fn step1_list_failure_continues() {
+        let (mock, calls) = MockCleanupClient::new(&["list_env_components"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        let attempted = |op| contains(&calls, op);
+        assert!(attempted("list_env_domain_registrations"), "{calls:?}");
+        assert!(attempted("get_env_app_id_and_revision"), "{calls:?}");
+        assert!(attempted("get_account_revision"), "{calls:?}");
+    }
+
+    #[test]
+    fn step2_list_failure_continues() {
+        let (mock, calls) = MockCleanupClient::new(&["list_env_domain_registrations"]);
+        run(&mock); // the step 2 listing fails during this run
+        let calls = calls.lock().unwrap().clone();
+        assert!(contains(&calls, "get_env_app_id_and_revision"), "{calls:?}"); // step 3 ran
+        assert!(contains(&calls, "get_account_revision"), "{calls:?}"); // step 5 ran
+    }
+
+    /// Step 3 lookup fails → step 4 (which needs the app ID) is skipped; step 5 runs.
+    #[test]
+    fn step3_get_failure_skips_step4_runs_step5() {
+        let (mock, calls) = MockCleanupClient::new(&["get_env_app_id_and_revision"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        let attempted = |op| contains(&calls, op);
+        assert!(
+            !attempted("get_application_revision"),
+            "step 4 must be skipped when step 3 get fails; got: {calls:?}"
+        );
+        assert!(
+            attempted("get_account_revision"),
+            "step 5 must still run; got: {calls:?}"
+        );
+    }
+
+    /// `delete_environment` fails after a successful lookup → steps 4 and 5 still run.
+    #[test]
+    fn step3_delete_failure_still_runs_step4_and_step5() {
+        let (mock, calls) = MockCleanupClient::new(&["delete_environment"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        let attempted = |op| contains(&calls, op);
+        assert!(attempted("get_application_revision"), "{calls:?}");
+        assert!(attempted("get_account_revision"), "{calls:?}");
+    }
+
+    #[test]
+    fn step4_failure_continues_to_step5() {
+        let (mock, calls) = MockCleanupClient::new(&["get_application_revision"]);
+        run(&mock); // the step 4 revision lookup fails during this run
+        let calls = calls.lock().unwrap().clone();
+        assert!(
+            contains(&calls, "get_account_revision"),
+            "step 5 should run after step 4 failure; got: {calls:?}"
+        );
+    }
+
+    /// Step 5 lookup fails → `run` still returns and `delete_account` is never attempted.
+    #[test]
+    fn step5_get_failure_no_delete_and_completes() {
+        let (mock, calls) = MockCleanupClient::new(&["get_account_revision"]);
+        run(&mock);
+        let calls = calls.lock().unwrap().clone();
+        let attempted = |op| contains(&calls, op);
+        assert!(attempted("get_account_revision"), "{calls:?}");
+        assert!(
+            !attempted("delete_account"),
+            "delete_account must not run when get fails; got: {calls:?}"
+        );
+    }
+
+    /// All operations fail simultaneously — the run completes without panic,
+    /// every unconditional step is attempted, and no dependent step is.
+    #[test]
+    fn all_steps_fail_no_short_circuit() {
+        let (mock, calls) = MockCleanupClient::new(&all_ops());
+        run(&mock); // must not panic
+        let calls = calls.lock().unwrap().clone();
+        let expected = [
+            "list_env_components",
+            "list_env_domain_registrations",
+            "get_env_app_id_and_revision",
+            "get_account_revision",
+        ];
+        assert_eq!(calls, expected, "unexpected call sequence");
+    }
+}
diff --git a/integration-tests/src/benchmarks/cold_start_unknown.rs b/integration-tests/src/benchmarks/cold_start_unknown.rs
index f29f297658..592b80e2e4 100644
--- a/integration-tests/src/benchmarks/cold_start_unknown.rs
+++ b/integration-tests/src/benchmarks/cold_start_unknown.rs
@@ -12,12 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent};
+use crate::benchmarks::{
+    cleanup_account, cleanup_env_and_app, delete_workers, invoke_and_await_agent,
+};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -196,6 +199,8 @@ impl Benchmark for ColdStartUnknownMedium {
 pub struct IterationContext {
     user: TestUserContext<BenchmarkTestDependencies>,
     agents: Vec<(ComponentDto, ParsedAgentId)>,
+    /// One env_id per size (cold_start creates one env per component).
+    env_ids: Vec<EnvironmentId>,
 }
 
 pub struct ColdStartUnknownBenchmark {
@@ -235,11 +240,13 @@ impl ColdStartUnknownBenchmark {
     pub async fn setup_iteration(&self, config: &RunConfig) -> IterationContext {
         let user = self.deps.user().await.unwrap();
         let mut agents = vec![];
+        let mut env_ids = vec![];
 
         for _ in 0..config.size {
             // Agent types names are unique within one environment,
             // so make sure each component get its own env
             let (_, env) = user.app_and_env().await.unwrap();
+            env_ids.push(env.id);
 
             let component = user
                 .component(&env.id, &self.component_name)
@@ -252,7 +259,11 @@ impl ColdStartUnknownBenchmark {
             agents.push((component, agent_id));
         }
 
-        IterationContext { user, agents }
+        IterationContext {
+            user,
+            agents,
+            env_ids,
+        }
     }
 
     pub async fn warmup(&self, config: &RunConfig) {
@@ -298,6 +309,14 @@ impl ColdStartUnknownBenchmark {
             .iter()
             .filter_map(|(component, agent_id)| AgentId::from_agent_id(component.id, agent_id).ok())
             .collect();
-        delete_workers(&iteration.user, &agent_ids).await
+        delete_workers(&iteration.user, &agent_ids).await;
+        // Clean up each env/app individually, then delete the account once.
+        // This avoids the account being deleted on the first env cleanup and
+        // causing subsequent cleanup calls to fail (since the user token would
+        // be invalid after account deletion).
+        for env_id in &iteration.env_ids {
+            cleanup_env_and_app(&iteration.user, env_id).await;
+        }
+        cleanup_account(&iteration.user).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/durability_overhead.rs b/integration-tests/src/benchmarks/durability_overhead.rs
index f956eb3636..fb864fd44c 100644
--- a/integration-tests/src/benchmarks/durability_overhead.rs
+++ b/integration-tests/src/benchmarks/durability_overhead.rs
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent};
+use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::{ComponentDto, ComponentId};
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -42,6 +43,7 @@ pub struct DurabilityOverheadIterationContext {
     durable_nonpersistent_agent_ids: Vec<ParsedAgentId>,
     ephemeral_agent_ids: Vec<ParsedAgentId>,
     durable_persistent_commit_agent_ids: Vec<ParsedAgentId>,
+    env_id: EnvironmentId,
 }
 
 fn agent_ids_to_agent_ids(component_id: ComponentId, agent_ids: &[ParsedAgentId]) -> Vec<AgentId> {
@@ -146,6 +148,7 @@ impl Benchmark for DurabilityOverhead {
             durable_nonpersistent_agent_ids,
             ephemeral_agent_ids,
             durable_persistent_commit_agent_ids,
+            env_id: env.id,
         }
     }
 
@@ -336,5 +339,6 @@ impl Benchmark for DurabilityOverhead {
             ),
         )
         .await;
+        cleanup_user_state(&context.user, &context.env_id).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/latency.rs b/integration-tests/src/benchmarks/latency.rs
index a44ff42333..006d29f228 100644
--- a/integration-tests/src/benchmarks/latency.rs
+++ b/integration-tests/src/benchmarks/latency.rs
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent};
+use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -200,6 +201,7 @@ pub struct IterationContext {
     component: ComponentDto,
     agent_ids: Vec<ParsedAgentId>,
     length: usize,
+    env_id: EnvironmentId,
 }
 
 pub struct LatencyBenchmark {
@@ -261,6 +263,7 @@ impl LatencyBenchmark {
             component,
             agent_ids,
             length: config.length,
+            env_id: env.id,
         }
     }
 
@@ -326,6 +329,7 @@ impl LatencyBenchmark {
             .iter()
             .filter_map(|agent_id| AgentId::from_agent_id(iteration.component.id, agent_id).ok())
             .collect();
-        delete_workers(&iteration.user, &agent_ids).await
+        delete_workers(&iteration.user, &agent_ids).await;
+        cleanup_user_state(&iteration.user, &iteration.env_id).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/mod.rs b/integration-tests/src/benchmarks/mod.rs
index b15dde89a3..0682055643 100644
--- a/integration-tests/src/benchmarks/mod.rs
+++ b/integration-tests/src/benchmarks/mod.rs
@@ -29,15 +29,20 @@ use std::time::{Duration, SystemTime};
 use tracing::{Instrument, info, warn};
 use tracing_opentelemetry::OpenTelemetrySpanExt;
 
+pub mod cleanup;
 pub mod cold_start_unknown;
 pub mod durability_overhead;
 pub mod latency;
 pub mod sleep;
 pub mod throughput;
+pub mod throughput_saturation;
 
-/// Injects the current tracing span's OpenTelemetry trace context (traceparent/tracestate)
-/// into a reqwest Request's headers so that downstream services can link their
-/// spans to the benchmark's trace.
+// Re-export cleanup helpers so callers can use the flat `benchmarks::*` path.
+pub use cleanup::{cleanup_account, cleanup_env_and_app, cleanup_user_state};
+
+/// Injects the current tracing span's OpenTelemetry trace context
+/// (traceparent/tracestate) into a reqwest Request's headers so that
+/// downstream services can link their spans to the benchmark's trace.
 fn inject_trace_context(request: &mut Request) {
     let current_span = tracing::Span::current();
     let otel_context = current_span.context();
diff --git a/integration-tests/src/benchmarks/sleep.rs b/integration-tests/src/benchmarks/sleep.rs
index 97bb64e16f..457872ed29 100644
--- a/integration-tests/src/benchmarks/sleep.rs
+++ b/integration-tests/src/benchmarks/sleep.rs
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::delete_workers;
+use crate::benchmarks::{cleanup_user_state, delete_workers};
 use async_trait::async_trait;
 use futures_concurrency::future::Join;
 use golem_common::base_model::agent::ParsedAgentId;
 use golem_common::model::AgentId;
 use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
 use golem_common::{agent_id, data_value};
 use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
@@ -39,6 +40,7 @@ pub struct SleepIterationContext {
     user: TestUserContext<BenchmarkTestDependencies>,
     component: ComponentDto,
     agent_ids: Vec<ParsedAgentId>,
+    env_id: EnvironmentId,
 }
 
 #[async_trait]
@@ -111,6 +113,7 @@ impl Benchmark for Sleep {
             user,
             component,
             agent_ids,
+            env_id: env.id,
         }
     }
 
@@ -184,6 +187,7 @@ impl Benchmark for Sleep {
             .iter()
             .filter_map(|agent_id| AgentId::from_agent_id(context.component.id, agent_id).ok())
             .collect();
-        delete_workers(&context.user, &agent_ids).await
+        delete_workers(&context.user, &agent_ids).await;
+        cleanup_user_state(&context.user, &context.env_id).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/throughput.rs b/integration-tests/src/benchmarks/throughput.rs
index 9cdecd7a1f..f3552e0eee 100644
--- a/integration-tests/src/benchmarks/throughput.rs
+++ b/integration-tests/src/benchmarks/throughput.rs
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent, invoke_and_await_http};
+use crate::benchmarks::{
+    cleanup_user_state, delete_workers, invoke_and_await_agent, invoke_and_await_http,
+};
 use async_trait::async_trait;
 use axum::http::{HeaderMap, HeaderValue};
 use futures_concurrency::future::Join;
@@ -21,12 +23,13 @@ use golem_common::base_model::agent::{DataValue, ParsedAgentId};
 use golem_common::model::agent::AgentTypeName;
 use golem_common::model::component::{ComponentDto, ComponentId};
 use golem_common::model::domain_registration::{Domain, DomainRegistrationCreation};
+use golem_common::model::environment::EnvironmentId;
 use golem_common::model::http_api_deployment::{
     HttpApiDeploymentAgentOptions, HttpApiDeploymentCreation,
 };
 use golem_common::model::{AgentId, RoutingTable};
 use golem_common::{agent_id, data_value};
-use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig};
+use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig};
 use golem_test_framework::config::benchmark::TestMode;
 use golem_test_framework::config::dsl_impl::TestUserContext;
 use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies};
@@ -35,6 +38,7 @@ use indoc::indoc;
 use reqwest::{Body, Method, Request, Url};
 use serde_json::json;
 use std::collections::BTreeMap;
+use std::time::Instant;
 use tracing::{Instrument, Level, info};
 
 pub struct ThroughputEcho {
@@ -79,16 +83,14 @@ impl Benchmark for ThroughputEcho {
             "echo",
             "echo",
             Box::new(|_| data_value!("benchmark")),
-            Box::new(|port, idx, _length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/test-{idx}-http/echo/test-message"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, _length| {
+                let url =
+                    Url::parse(&format!("{base_url}/test-{idx}-http/echo/test-message")).unwrap();
                 Request::new(Method::POST, url)
             }),
-            Box::new(|port, idx, _length| {
+            Box::new(|base_url, idx, _length| {
                 let url = Url::parse(&format!(
-                    "http://localhost:{port}/rust/test-{idx}-http/echo/test-message"
+                    "{base_url}/rust/test-{idx}-http/echo/test-message"
                 ))
                 .unwrap();
                 Request::new(Method::POST, url)
@@ -179,21 +181,16 @@ impl Benchmark for ThroughputLargeInput {
                 let bytes = vec![0u8; length];
                 data_value!(bytes)
             }),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/test-{idx}-http/large-input"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url = Url::parse(&format!("{base_url}/test-{idx}-http/large-input")).unwrap();
                 let json_body = json!({"input": vec![0u8; length]}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
                 request
             }),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/rust/test-{idx}-http/large-input"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url =
+                    Url::parse(&format!("{base_url}/rust/test-{idx}-http/large-input")).unwrap();
                 let json_body = json!({"input": vec![0u8; length]}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
@@ -282,21 +279,16 @@ impl Benchmark for ThroughputCpuIntensive {
             "cpu_intensive",
             "cpuIntensive",
             Box::new(|length| data_value!(length as f64)),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/test-{idx}-http/cpu-intensive"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url = Url::parse(&format!("{base_url}/test-{idx}-http/cpu-intensive")).unwrap();
                 let json_body = json!({"length": length}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
                 request
             }),
-            Box::new(|port, idx, length| {
-                let url = Url::parse(&format!(
-                    "http://localhost:{port}/rust/test-{idx}-http/cpu-intensive"
-                ))
-                .unwrap();
+            Box::new(|base_url, idx, length| {
+                let url =
+                    Url::parse(&format!("{base_url}/rust/test-{idx}-http/cpu-intensive")).unwrap();
                 let json_body = json!({"length": length}).to_string();
                 let mut request = Request::new(Method::POST, url);
                 *request.body_mut() = Some(Body::wrap(json_body));
@@ -402,14 +394,20 @@ impl AgentInvocationTarget {
         }
     }
 
-    pub fn prefix(&self, prefix: &str, routing_table: &RoutingTable) -> String {
+    pub fn prefix(&self, prefix: &str, routing_table: &Option<RoutingTable>) -> String {
         match self {
             AgentInvocationTarget::Single { .. } => prefix.to_string(),
             AgentInvocationTarget::Pair { pair, .. } => {
-                if pair.at_same_worker_executor(routing_table) {
-                    format!("{prefix}local-")
+                if let Some(rt) = routing_table {
+                    if pair.at_same_worker_executor(rt) {
+                        format!("{prefix}local-")
+                    } else {
+                        format!("{prefix}remote-")
+                    }
                 } else {
-                    format!("{prefix}remote-")
+                    // Routing table not available (no shard-manager port-forward
+                    // configured); all RPC pairs go into a single unlabeled bucket.
+                    prefix.to_string()
                 }
             }
         }
@@ -426,19 +424,35 @@ pub struct IterationContext {
     rust_agent_ids_for_http: Vec<ParsedAgentId>,
     ts_agent_ids_for_http: Vec<ParsedAgentId>,
     length: usize,
-    routing_table: RoutingTable,
+    /// `None` when shard-manager host/port are not configured (cloud mode
+    /// without port-forward). When `None`, RPC pairs go into a single unlabeled
+    /// bucket instead of being split into local/remote.
+    routing_table: Option<RoutingTable>,
     ts_rpc_agent_id_pairs: Vec<AgentIdPair>,
     rust_rpc_agent_id_pairs: Vec<AgentIdPair>,
+    env_id: EnvironmentId,
 }
 
+/// Boxed builder that produces one HTTP `Request` for the throughput benchmark.
+/// Called as `(base_url, agent_index, length)`; `base_url` carries the scheme,
+/// host and optional port with no path (e.g. `http://localhost:8084` in local
+/// mode, `https://myenv.apps.golem.dev` in cloud mode).
+type HttpRequestFn = Box<dyn Fn(&str, usize, usize) -> Request + Send + Sync + 'static>;
+
 pub struct ThroughputBenchmark {
     rust_method_name: String,
     ts_method_name: String,
     agent_params: Box<dyn Fn(usize) -> DataValue + Send + Sync + 'static>,
-    http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
-    rust_http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
+    http_request: HttpRequestFn,
+    rust_http_request: HttpRequestFn,
     deps: BenchmarkTestDependencies,
     call_count: usize,
+    /// Pre-built HTTP client for cloud-mode apps-domain calls
+    /// (`https://{env_id}.{apps_base_domain}`).  Cached here so the
+    /// connection pool is warm across benchmark iterations.
+    /// `None` in local/provided mode (client is built per-iteration from the
+    /// custom-request port with a Host header override).
+    cloud_http_client: Option<reqwest::Client>,
 }
 
 fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> Vec<AgentId> {
@@ -447,13 +461,38 @@ fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> V
         .collect()
 }
 
+/// Emits the aggregate invocation rate of one measurement block as a `count`
+/// result keyed `{prefix}throughput-ops-per-sec`, and logs it at info level.
+///
+/// `total_calls` is the number of invocations issued across every target in
+/// the block and `elapsed` the wall-clock time the concurrently executed block
+/// took, so the value is the rate achieved by the block as a whole, rounded to
+/// whole ops/sec. Nothing is recorded when either input is zero.
+fn record_throughput(
+    recorder: &BenchmarkRecorder,
+    prefix: &str,
+    total_calls: usize,
+    elapsed: std::time::Duration,
+) {
+    let secs = elapsed.as_secs_f64();
+    if total_calls == 0 || secs <= 0.0 {
+        return;
+    }
+
+    let key = ResultKey::primary(format!("{prefix}throughput-ops-per-sec"));
+    let rate = total_calls as f64 / secs;
+    let ops_per_sec = rate.round() as u64;
+    info!("{prefix}throughput: {total_calls} calls in {secs:.3}s = {ops_per_sec} ops/sec");
+    recorder.count(&key, ops_per_sec);
+}
+
 impl ThroughputBenchmark {
     pub async fn new(
         rust_method_name: &str,
         ts_method_name: &str,
         agent_params: Box<dyn Fn(usize) -> DataValue + Send + Sync + 'static>,
-        http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
-        rust_http_request: Box<dyn Fn(u16, usize, usize) -> Request + Send + Sync + 'static>,
+        http_request: HttpRequestFn,
+        rust_http_request: HttpRequestFn,
         mode: &TestMode,
         verbosity: Level,
         cluster_size: usize,
@@ -461,21 +500,40 @@ impl ThroughputBenchmark {
         call_count: usize,
         otlp: bool,
     ) -> Self {
+        let deps = BenchmarkTestDependencies::new(
+            mode,
+            verbosity,
+            cluster_size,
+            disable_compilation_cache,
+            otlp,
+        )
+        .await;
+
+        // Build the cloud HTTP client once so the connection pool stays alive
+        // across all benchmark iterations.  In cloud mode requests go to
+        // https://{env_id}.{apps_base_domain}, so we use standard TLS with
+        // ALPN negotiation — NOT http2_prior_knowledge() which is for h2c
+        // (cleartext HTTP/2) and would bypass the ALPN step that the NLB
+        // terminating TLS expects.
+        let cloud_http_client = deps.apps_base_domain().map(|_| {
+            reqwest::ClientBuilder::new()
+                .pool_max_idle_per_host(1024)
+                .pool_idle_timeout(std::time::Duration::from_secs(90))
+                .tcp_nodelay(true)
+                .timeout(std::time::Duration::from_secs(180))
+                .build()
+                .expect("Failed to build cloud HTTP client for throughput benchmark")
+        });
+
         Self {
             rust_method_name: rust_method_name.to_string(),
             ts_method_name: ts_method_name.to_string(),
             agent_params,
             http_request,
             rust_http_request,
-            deps: BenchmarkTestDependencies::new(
-                mode,
-                verbosity,
-                cluster_size,
-                disable_compilation_cache,
-                otlp,
-            )
-            .await,
+            deps,
             call_count,
+            cloud_http_client,
         }
     }
 
@@ -491,13 +549,23 @@ impl ThroughputBenchmark {
         let mut ts_rpc_agent_id_pairs = vec![];
         let mut rust_rpc_agent_id_pairs = vec![];
 
-        let routing_table = self
-            .deps
-            .shard_manager()
-            .get_routing_table()
-            .await
-            .expect("Failed to get routing table");
-        info!("Fetched routing table: {routing_table}");
+        // Fetch the routing table from the shard-manager. Any error (typically
+        // cloud mode without a port-forward to it) is logged and mapped to
+        // None, which puts all RPC pairs into a single unlabeled bucket.
+        let routing_table: Option<RoutingTable> =
+            match self.deps.shard_manager().get_routing_table().await {
+                Ok(rt) => {
+                    info!("Fetched routing table: {rt}");
+                    Some(rt)
+                }
+                Err(err) => {
+                    info!(
+                        "Shard-manager not available, skipping routing table (RPC pairs \
+                         will be unlabeled): {err:#}"
+                    );
+                    None
+                }
+            };
 
         let user = self.deps.user().await.unwrap();
         let (_, env) = user.app_and_env().await.unwrap();
@@ -542,7 +610,14 @@ impl ThroughputBenchmark {
 
         let client = user.registry_service_client().await;
 
-        let domain = Domain(format!("{}.golem.cloud", env.id));
+        // In cloud mode, use the configured apps_base_domain. Fall back to
+        // "golem.cloud" for local/provided modes.
+        let apps_base_domain = self
+            .deps
+            .apps_base_domain()
+            .unwrap_or("golem.cloud")
+            .to_string();
+        let domain = Domain(format!("{}.{}", env.id, apps_base_domain));
 
         async {
             client
@@ -605,6 +680,7 @@ impl ThroughputBenchmark {
             routing_table,
             ts_rpc_agent_id_pairs,
             rust_rpc_agent_id_pairs,
+            env_id: env.id,
         }
     }
 
@@ -713,7 +789,7 @@ impl ThroughputBenchmark {
     pub async fn run(&self, iteration: &IterationContext, recorder: BenchmarkRecorder) {
         async fn measure_agents(
             user: &TestUserContext<BenchmarkTestDependencies>,
-            routing_table: &RoutingTable,
+            routing_table: &Option<RoutingTable>,
             recorder: &BenchmarkRecorder,
             length: usize,
             call_count: usize,
@@ -746,7 +822,10 @@ impl ThroughputBenchmark {
                 })
                 .collect::<Vec<_>>();
 
+            let started = Instant::now();
             let results = result_futures.join().await;
+            let elapsed = started.elapsed();
+            record_throughput(recorder, prefix, targets.len() * call_count, elapsed);
             for (idx, (results, target)) in results.iter().zip(targets).enumerate() {
                 let prefix = target.prefix(prefix, routing_table);
                 for result in results {
@@ -799,31 +878,51 @@ impl ThroughputBenchmark {
         .instrument(tracing::info_span!("measure_ts_agents"))
         .await;
 
-        let port = self.deps.worker_service().custom_request_port();
-
-        let client = {
-            let mut headers = HeaderMap::new();
-            headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap());
-            reqwest::Client::builder()
-                .default_headers(headers)
-                .build()
-                .expect("Failed to create HTTP client")
-        };
+        // Resolve the base URL prefix and HTTP client for the code-first HTTP
+        // API benchmark paths. The request-builder closures append the route
+        // path (e.g. "/test-0-http/echo/...") to this prefix.
+        //
+        //   cloud mode:  base = "https://{env_id}.{apps_base_domain}"
+        //                → reqwest connects directly to that host (TLS/SNI +
+        //                  Host set from the URL); the apps gateway routes it
+        //                  to worker-service. Uses the cached, pool-warm client.
+        //
+        //   local mode:  base = "http://localhost:{custom_request_port}"
+        //                → reqwest connects to localhost; an explicit Host
+        //                  header ("{env_id}.golem.cloud") tells the local
+        //                  worker-service which deployment to route to.
+        let (http_base_url, client): (String, reqwest::Client) =
+            if let Some(ref cached) = self.cloud_http_client {
+                let base = format!("https://{}", iteration.domain.0);
+                (base, cached.clone())
+            } else {
+                let port = self.deps.worker_service().custom_request_port();
+                let base = format!("http://localhost:{port}");
+                let mut headers = HeaderMap::new();
+                headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap());
+                let c = reqwest::Client::builder()
+                    .default_headers(headers)
+                    .build()
+                    .expect("Failed to create HTTP client");
+                (base, c)
+            };
 
         async {
             let client = client.clone();
+            let base = http_base_url.clone();
             let result_futures = iteration
                 .rust_agent_ids_for_http
                 .iter()
                 .enumerate()
                 .map(move |(idx, _agent_id)| {
                     let client = client.clone();
+                    let base = base.clone();
                     async move {
                         let mut results = vec![];
                         for _ in 0..self.call_count {
                             results.push(
                                 invoke_and_await_http(client.clone(), || {
-                                    (self.rust_http_request)(port, idx, iteration.length)
+                                    (self.rust_http_request)(&base, idx, iteration.length)
                                 })
                                 .await,
                             )
@@ -833,7 +932,15 @@ impl ThroughputBenchmark {
                 })
                 .collect::<Vec<_>>();
 
+            let started = Instant::now();
             let results = result_futures.join().await;
+            let elapsed = started.elapsed();
+            record_throughput(
+                &recorder,
+                "rust-agent-http-",
+                iteration.rust_agent_ids_for_http.len() * self.call_count,
+                elapsed,
+            );
             for (idx, results) in results.iter().enumerate() {
                 for result in results {
                     result.record(&recorder, "rust-agent-http-", idx.to_string().as_str());
@@ -850,12 +957,13 @@ impl ThroughputBenchmark {
                 .enumerate()
                 .map(move |(idx, _agent_id)| {
                     let client = client.clone();
+                    let base = http_base_url.clone();
                     async move {
                         let mut results = vec![];
                         for _ in 0..self.call_count {
                             results.push(
                                 invoke_and_await_http(client.clone(), || {
-                                    (self.http_request)(port, idx, iteration.length)
+                                    (self.http_request)(&base, idx, iteration.length)
                                 })
                                 .await,
                             )
@@ -865,7 +973,15 @@ impl ThroughputBenchmark {
                 })
                 .collect::<Vec<_>>();
 
+            let started = Instant::now();
             let results = result_futures.join().await;
+            let elapsed = started.elapsed();
+            record_throughput(
+                &recorder,
+                "ts-agent-http-",
+                iteration.ts_agent_ids_for_http.len() * self.call_count,
+                elapsed,
+            );
             for (idx, results) in results.iter().enumerate() {
                 for result in results {
                     result.record(&recorder, "ts-agent-http-", idx.to_string().as_str());
@@ -969,5 +1085,6 @@ impl ThroughputBenchmark {
             }
         }
         delete_workers(&iteration.user, &rust_rpc_workers).await;
+        cleanup_user_state(&iteration.user, &iteration.env_id).await;
     }
 }
diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs
new file mode 100644
index 0000000000..768d8c7eb1
--- /dev/null
+++ b/integration-tests/src/benchmarks/throughput_saturation.rs
@@ -0,0 +1,425 @@
+// Copyright 2024-2026 Golem Cloud
+//
+// Licensed under the Golem Source License v1.1 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://license.golem.cloud/LICENSE
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Throughput-under-memory-saturation benchmarks.
+//!
+//! Unlike the regular throughput benchmark — which keeps `size` small enough
+//! that all workers fit comfortably in memory — these benchmarks deliberately
+//! ramp the number of *active* agents up to and past the executor's memory
+//! ceiling, to find the knee: the agent count where the pod can still keep
+//! everything resident (latency flat, throughput scaling linearly) just before
+//! it starts evicting and replaying (latency spikes, throughput craters).
+//!
+//! The measured `run` phase drives sustained load over a fixed window: each
+//! agent repeatedly does a short unit of work then goes idle for [`IDLE_GAP`].
+//! During that gap the agent has no in-flight work and becomes a `LoadedIdle`
+//! eviction candidate, so under memory pressure it can be evicted and then must
+//! reload (oplog replay + re-admission) on its next call — the churn that makes
+//! throughput crater past the knee. Starts are staggered so the fleet is not
+//! synchronised.
+//!
+//! Three variants:
+//! - `throughput-saturation-counters`: agent-counters with a synthetic,
+//!   per-agent-distinct retained footprint (`allocate_memory`) plus CPU work
+//!   (`busy_for`). The footprint is controllable via `length`.
+//! - `throughput-saturation-echo-rust` / `throughput-saturation-echo-ts`: the
+//!   benchmark `echo` agent (Rust / TS) called repeatedly. No synthetic
+//!   footprint — the per-agent memory is the agent's natural footprint, which
+//!   for the TS agent includes the QuickJS runtime. Answers "how many actively
+//!   invoked echo agents fit per pod".
+//!
+//! Parameters:
+//! - `size`   = number of active agents in this step (the ramp axis).
+//! - `length` = for the counters variant, the base per-agent memory footprint in
+//!   bytes (agent `i` retains a deterministic multiple); ignored by the echo
+//!   variants.
+
+use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent};
+use async_trait::async_trait;
+use futures_concurrency::future::Join;
+use golem_common::base_model::agent::{DataValue, ParsedAgentId};
+use golem_common::model::AgentId;
+use golem_common::model::component::ComponentDto;
+use golem_common::model::environment::EnvironmentId;
+use golem_common::{agent_id, data_value};
+use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig};
+use golem_test_framework::config::benchmark::TestMode;
+use golem_test_framework::config::dsl_impl::TestUserContext;
+use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies};
+use golem_test_framework::dsl::{TestDsl, TestDslExtended};
+use indoc::indoc;
+use std::time::{Duration, Instant};
+use tracing::{Instrument, Level, info};
+
+/// Number of distinct footprint buckets the synthetic per-agent memory spread
+/// cycles through, so the fleet holds a mix of sizes rather than a uniform
+/// amount.
+const SPREAD_BUCKETS: usize = 8;
+
+/// CPU busy time (ms) per `busy_for` invocation (counters variant only).
+const BUSY_MILLIS: u32 = 50;
+
+/// Idle gap each agent sleeps between calls. During this gap the agent has no
+/// in-flight work and becomes a `LoadedIdle` eviction candidate. Under memory
+/// pressure it may be evicted and then must reload on its next call — the churn
+/// this benchmark exists to measure.
+const IDLE_GAP: Duration = Duration::from_millis(200);
+
+/// Total measured wall-clock duration of the sustained-load phase. Throughput
+/// and churn are measured over this fixed window so steps with different `size`
+/// are comparable. Held long enough that the high-residency plateau persists for
+/// at least a minute, so steady-state behaviour at the memory ceiling (not just
+/// the initial burst) is observed.
+const RUN_DURATION: Duration = Duration::from_secs(90);
+
+/// Maximum per-agent start stagger, so the fleet is not synchronised: at any
+/// instant some agents are mid-call (demanding memory) while others sit idle
+/// (evictable).
+const MAX_STAGGER: Duration = Duration::from_millis(250);
+
+/// Number of bytes agent `index` keeps resident in the synthetic-footprint
+/// variant. Indices cycle through [`SPREAD_BUCKETS`] multipliers, giving
+/// `base * 1` ..= `base * SPREAD_BUCKETS`, so neighbouring agents hold different
+/// amounts. The product saturates and is clamped to `u32::MAX`.
+fn agent_memory_bytes(index: usize, base: usize) -> u32 {
+    let multiplier = index % SPREAD_BUCKETS + 1;
+    u32::try_from(base.saturating_mul(multiplier)).unwrap_or(u32::MAX)
+}
+
+/// Deterministic start delay for agent `index`, in `[0, MAX_STAGGER)`: the index
+/// is wrapping-multiplied by a fixed constant and reduced to thousandths.
+fn agent_stagger(index: usize) -> Duration {
+    let permille = (index as u32).wrapping_mul(2_654_435_761) % 1000;
+    MAX_STAGGER.checked_mul(permille).map_or(Duration::ZERO, |total| total / 1000)
+}
+
+/// Describes one saturation variant: which component to load, which agent type
+/// and method to actively invoke, and whether to pre-load a synthetic footprint.
+struct SaturationVariant {
+    /// WASM file name (without `.wasm`) in the component directory.
+    wasm_name: &'static str,
+    /// Registry display name for the component.
+    component_name: &'static str,
+    /// Agent type to instantiate.
+    agent_type: &'static str,
+    /// Method invoked repeatedly during the measured phase.
+    active_method: &'static str,
+    /// Builds the parameter for one `active_method` call.
+    active_params: fn() -> DataValue,
+    /// When set, each agent calls this method once in warmup with its
+    /// deterministic footprint (`allocate_memory`-style). `None` for the echo
+    /// variants, whose footprint is the agent's natural memory.
+    allocate_method: Option<&'static str>,
+}
+
+const COUNTERS_VARIANT: SaturationVariant = SaturationVariant {
+    wasm_name: "it_agent_counters_release",
+    component_name: "it:agent-counters",
+    agent_type: "Counter",
+    active_method: "busy_for",
+    active_params: || data_value!(BUSY_MILLIS),
+    allocate_method: Some("allocate_memory"),
+};
+
+const ECHO_RUST_VARIANT: SaturationVariant = SaturationVariant {
+    wasm_name: "benchmark_agent_rust_release",
+    component_name: "benchmark:agent-rust",
+    agent_type: "RustBenchmarkAgent",
+    active_method: "echo",
+    active_params: || data_value!("saturation"),
+    allocate_method: None,
+};
+
+const ECHO_TS_VARIANT: SaturationVariant = SaturationVariant {
+    wasm_name: "benchmark_agent_ts",
+    component_name: "benchmark:agent-ts",
+    agent_type: "BenchmarkAgent",
+    active_method: "echo",
+    active_params: || data_value!("saturation"),
+    allocate_method: None,
+};
+
+pub struct SaturationBenchmarkContext {
+    deps: BenchmarkTestDependencies,
+}
+
+pub struct SaturationIterationContext {
+    user: TestUserContext<BenchmarkTestDependencies>,
+    component: ComponentDto,
+    agent_ids: Vec<ParsedAgentId>,
+    base_memory_bytes: usize,
+    env_id: EnvironmentId,
+}
+
+/// Builds the benchmark-level context shared by every saturation variant; it
+/// only wraps `BenchmarkTestDependencies::new` and takes no variant-specific input.
+async fn create_context(
+    mode: &TestMode,
+    verbosity: Level,
+    cluster_size: usize,
+    disable_compilation_cache: bool,
+    otlp: bool,
+) -> SaturationBenchmarkContext {
+    SaturationBenchmarkContext {
+        deps: BenchmarkTestDependencies::new(
+            mode,
+            verbosity,
+            cluster_size,
+            disable_compilation_cache,
+            otlp,
+        )
+        .await,
+    }
+}
+
+/// Stores the variant's component in the user's environment and derives
+/// `config.size` agent ids (`saturation-{n}`) for one benchmark iteration.
+async fn setup_iteration(
+    variant: &SaturationVariant,
+    config: &RunConfig,
+    benchmark_context: &SaturationBenchmarkContext,
+) -> SaturationIterationContext {
+    let user = benchmark_context.deps.user().await.unwrap();
+    let (_, env) = user.app_and_env().await.unwrap();
+
+    info!("Registering component {}", variant.component_name);
+    let component = user
+        .component(&env.id, variant.wasm_name)
+        .name(variant.component_name)
+        .store()
+        .await
+        .unwrap();
+    let agent_ids = (0..config.size)
+        .map(|n| agent_id!(variant.agent_type, format!("saturation-{n}")))
+        .collect();
+
+    SaturationIterationContext {
+        user,
+        component,
+        agent_ids,
+        base_memory_bytes: config.length,
+        env_id: env.id,
+    }
+}
+
+/// Pre-loads the synthetic footprint: concurrently, every agent invokes the
+/// variant's `allocate_method` once with its `agent_memory_bytes` share of
+/// `base_memory_bytes`. Invocation results are discarded.
+async fn warmup(variant: &SaturationVariant, context: &SaturationIterationContext) {
+    let Some(allocate_method) = variant.allocate_method else {
+        // Echo variants: nothing to pre-load; the agent's natural footprint is
+        // established on first invocation.
+        return;
+    };
+    let span = tracing::info_span!(
+        "warmup_allocate_memory",
+        agent_count = context.agent_ids.len()
+    );
+    let base = context.base_memory_bytes;
+
+    let allocations = context
+        .agent_ids
+        .iter()
+        .enumerate()
+        .map(move |(idx, agent_id)| async move {
+            let user_clone = context.user.clone();
+            let bytes = agent_memory_bytes(idx, base);
+            invoke_and_await_agent(
+                &user_clone,
+                &context.component,
+                agent_id,
+                allocate_method,
+                data_value!(bytes),
+            )
+            .await
+        })
+        .collect::<Vec<_>>();
+    let _ = allocations.join().instrument(span).await;
+}
+
+/// Drives staggered, sustained load until a shared [`RUN_DURATION`] deadline:
+/// each agent alternates one recorded `active_method` call with an idle gap.
+/// Records total calls over the measured wall-clock window as ops/sec.
+async fn run(
+    variant: &SaturationVariant,
+    context: &SaturationIterationContext,
+    recorder: BenchmarkRecorder,
+) {
+    let agent_count = context.agent_ids.len();
+    let started = Instant::now();
+    let deadline = started + RUN_DURATION;
+
+    let result_futures = context
+        .agent_ids
+        .iter()
+        .enumerate()
+        .map(|(idx, agent_id)| {
+            let recorder = recorder.clone();
+            async move {
+                let user_clone = context.user.clone();
+                tokio::time::sleep(agent_stagger(idx)).await;
+                let mut calls = 0u64;
+                while Instant::now() < deadline {
+                    let result = invoke_and_await_agent(
+                        &user_clone,
+                        &context.component,
+                        agent_id,
+                        variant.active_method,
+                        (variant.active_params)(),
+                    )
+                    .await;
+                    result.record(&recorder, "", idx.to_string().as_str());
+                    calls += 1;
+                    // Cap the idle gap at the time left: a full gap after the
+                    // last call would pad the window with time serving no call.
+                    let remaining = deadline.saturating_duration_since(Instant::now());
+                    tokio::time::sleep(IDLE_GAP.min(remaining)).await;
+                }
+                calls
+            }
+        })
+        .collect::<Vec<_>>();
+
+    let per_agent_calls = result_futures.join().await;
+
+    // Across `size` steps this aggregate rate shows where added agents stop
+    // adding throughput (eviction churn dominates) — the knee we are after.
+    let total_calls: u64 = per_agent_calls.iter().sum();
+    let secs = started.elapsed().as_secs_f64();
+    if secs > 0.0 {
+        let ops_per_sec = (total_calls as f64 / secs).round() as u64;
+        info!(
+            "saturation: {agent_count} agents, {total_calls} calls in {secs:.1}s = {ops_per_sec} ops/sec"
+        );
+        let key = ResultKey::primary("saturation-throughput-ops-per-sec");
+        recorder.count(&key, ops_per_sec);
+    }
+}
+
+/// Deletes the iteration's workers and then runs `cleanup_user_state` for its
+/// environment. Parsed ids that `AgentId::from_agent_id` rejects are skipped
+/// silently, so cleanup is best-effort for those agents.
+async fn cleanup_iteration(context: SaturationIterationContext) {
+    let convert = |id: &ParsedAgentId| AgentId::from_agent_id(context.component.id, id).ok();
+    let workers: Vec<AgentId> = context.agent_ids.iter().filter_map(convert).collect();
+    delete_workers(&context.user, &workers).await;
+    cleanup_user_state(&context.user, &context.env_id).await;
+}
+
+/// Generates a `Benchmark` impl wrapper for a saturation variant.
+macro_rules! saturation_benchmark {
+    ($ty:ident, $bench_name:literal, $variant:expr, $description:literal) => {
+        pub struct $ty {
+            config: RunConfig,
+        }
+
+        #[async_trait]
+        impl Benchmark for $ty {
+            type BenchmarkContext = SaturationBenchmarkContext;
+            type IterationContext = SaturationIterationContext;
+
+            fn name() -> &'static str {
+                $bench_name
+            }
+
+            fn description() -> &'static str {
+                indoc! { $description }
+            }
+
+            async fn create_benchmark_context(
+                mode: &TestMode,
+                verbosity: Level,
+                cluster_size: usize,
+                disable_compilation_cache: bool,
+                otlp: bool,
+            ) -> Self::BenchmarkContext {
+                create_context(
+                    mode,
+                    verbosity,
+                    cluster_size,
+                    disable_compilation_cache,
+                    otlp,
+                )
+                .await
+            }
+
+            async fn cleanup(benchmark_context: Self::BenchmarkContext) {
+                benchmark_context.deps.kill_all().await;
+            }
+
+            async fn create(_mode: &TestMode, config: RunConfig) -> Self {
+                Self { config }
+            }
+
+            async fn setup_iteration(
+                &self,
+                benchmark_context: &Self::BenchmarkContext,
+            ) -> Self::IterationContext {
+                setup_iteration(&$variant, &self.config, benchmark_context).await
+            }
+
+            async fn warmup(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: &Self::IterationContext,
+            ) {
+                warmup(&$variant, context).await
+            }
+
+            async fn run(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: &Self::IterationContext,
+                recorder: BenchmarkRecorder,
+            ) {
+                run(&$variant, context, recorder).await
+            }
+
+            async fn cleanup_iteration(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: Self::IterationContext,
+            ) {
+                cleanup_iteration(context).await
+            }
+        }
+    };
+}
+
+saturation_benchmark!(
+    ThroughputSaturationCounters,
+    "throughput-saturation-counters",
+    COUNTERS_VARIANT,
+    "Ramps `size` active agents that each retain a deterministic, per-agent-distinct
+    synthetic memory footprint (controlled by `length`) and do CPU work, measuring
+    sustained throughput to locate the memory-saturation knee."
+);
+
+saturation_benchmark!(
+    ThroughputSaturationEchoRust,
+    "throughput-saturation-echo-rust",
+    ECHO_RUST_VARIANT,
+    "Ramps `size` actively-invoked Rust `echo` agents to find how many fit resident
+    per pod before eviction churn craters throughput. The per-agent footprint is the
+    agent's natural memory (no synthetic allocation)."
+);
+
+saturation_benchmark!(
+    ThroughputSaturationEchoTs,
+    "throughput-saturation-echo-ts",
+    ECHO_TS_VARIANT,
+    "Ramps `size` actively-invoked TypeScript `echo` agents to find how many fit
+    resident per pod before eviction churn craters throughput. The per-agent
+    footprint is the agent's natural memory, including the QuickJS runtime."
+);
diff --git a/test-components/agent-counters/Cargo.toml b/test-components/agent-counters/Cargo.toml
index c7567da5a5..069f9180f3 100644
--- a/test-components/agent-counters/Cargo.toml
+++ b/test-components/agent-counters/Cargo.toml
@@ -3,6 +3,12 @@ name = "it_agent_counters"
 version = "0.0.1"
  edition = "2024"
 
+# Standalone workspace root: this component is excluded from the golem-oss
+# workspace, and when built nested inside another repo's workspace (e.g. the
+# cloud-perf CI checkout under golem-cloud) cargo would otherwise walk up and
+# attach it to that unrelated workspace. An empty table stops that search.
+[workspace]
+
 [profile.release]
 opt-level = "s"
 lto = true
diff --git a/test-components/agent-counters/src/lib.rs b/test-components/agent-counters/src/lib.rs
index b2ac7d4d44..b14840512d 100644
--- a/test-components/agent-counters/src/lib.rs
+++ b/test-components/agent-counters/src/lib.rs
@@ -3,6 +3,43 @@ pub mod repository;
 
 use golem_rust::{agent_definition, agent_implementation, generate_idempotency_key};
 
+/// Page size used when touching retained memory so the OS backs it with real
+/// resident pages rather than leaving it as untouched (non-resident) reservation.
+const PAGE_SIZE: usize = 4096;
+
+/// Burns CPU with cheap wrapping arithmetic for approximately `millis`
+/// milliseconds and returns the accumulator. The clock is polled once per
+/// 10,000-step batch; `black_box` keeps the arithmetic from being optimised
+/// away, since callers in this file discard the returned value.
+fn busy_loop(millis: u32) -> u32 {
+    let deadline = std::time::Duration::from_millis(u64::from(millis));
+    let start = std::time::Instant::now();
+    let mut acc: u32 = 0;
+    loop {
+        for i in 0..10_000u32 {
+            acc = acc.wrapping_add(i).wrapping_mul(31).wrapping_add(7);
+        }
+        acc = std::hint::black_box(acc);
+        if start.elapsed() >= deadline {
+            return acc;
+        }
+    }
+}
+
+/// Replaces the contents of `buffer` with `bytes` zeroed bytes, then bumps one
+/// byte every `PAGE_SIZE` bytes so each page is written, not merely reserved.
+fn retain_memory(buffer: &mut Vec<u8>, bytes: u32) {
+    let target_len = bytes as usize;
+    // Empty the vector and ask it to give back its capacity before regrowing.
+    buffer.clear();
+    buffer.shrink_to_fit();
+    buffer.resize(target_len, 0);
+    // Visits offsets 0, PAGE_SIZE, 2 * PAGE_SIZE, ... strictly below `target_len`.
+    for offset in (0..target_len).step_by(PAGE_SIZE) {
+        buffer[offset] = buffer[offset].wrapping_add(1);
+    }
+}
+
 #[agent_definition]
 trait Counter {
     fn new(id: String) -> Self;
@@ -10,17 +47,32 @@ trait Counter {
     async fn increment_through_rpc(&mut self) -> u32;
     async fn increment_through_rpc_to_ephemeral(&mut self) -> u32;
     async fn increment_through_rpc_to_ephemeral_phantom(&mut self) -> u32;
+
+    /// Spins for `millis` milliseconds of cheap CPU work, then increments and
+    /// returns the counter. Used to define an "active" agent without making the
+    /// workload oplog-bound on a tight loop.
+    fn busy_for(&mut self, millis: u32) -> u32;
+
+    /// Retains `bytes` of resident linear memory in the agent's state and
+    /// increments the counter. The memory stays resident across invocations so
+    /// the agent contributes a controllable footprint to the executor's pool.
+    fn allocate_memory(&mut self, bytes: u32) -> u32;
 }
 
 struct CounterImpl {
     count: u32,
     id: String,
+    retained: Vec<u8>,
 }
 
 #[agent_implementation]
 impl Counter for CounterImpl {
     fn new(id: String) -> Self {
-        Self { id, count: 0 }
+        Self {
+            id,
+            count: 0,
+            retained: Vec::new(),
+        }
     }
 
     fn increment(&mut self) -> u32 {
@@ -42,29 +94,64 @@ impl Counter for CounterImpl {
         let mut client = EphemeralSingletonCounterClient::new_phantom();
         client.increment().await
     }
+
+    fn busy_for(&mut self, millis: u32) -> u32 {
+        let _ = busy_loop(millis);
+        self.count += 1;
+        self.count
+    }
+
+    fn allocate_memory(&mut self, bytes: u32) -> u32 {
+        retain_memory(&mut self.retained, bytes);
+        self.count += 1;
+        self.count
+    }
 }
 
 #[agent_definition(ephemeral)]
 trait EphemeralCounter {
     fn new(id: String) -> Self;
     fn increment(&mut self) -> u32;
+
+    /// See [`Counter::busy_for`].
+    fn busy_for(&mut self, millis: u32) -> u32;
+
+    /// See [`Counter::allocate_memory`].
+    fn allocate_memory(&mut self, bytes: u32) -> u32;
 }
 
 struct EphemeralCounterImpl {
     count: u32,
     _id: String,
+    retained: Vec<u8>,
 }
 
 #[agent_implementation]
 impl EphemeralCounter for EphemeralCounterImpl {
     fn new(id: String) -> Self {
-        Self { _id: id, count: 0 }
+        Self {
+            _id: id,
+            count: 0,
+            retained: Vec::new(),
+        }
     }
 
     fn increment(&mut self) -> u32 {
         self.count += 1;
         self.count
     }
+
+    fn busy_for(&mut self, millis: u32) -> u32 {
+        let _ = busy_loop(millis);
+        self.count += 1;
+        self.count
+    }
+
+    fn allocate_memory(&mut self, bytes: u32) -> u32 {
+        retain_memory(&mut self.retained, bytes);
+        self.count += 1;
+        self.count
+    }
 }