From 981191f789cba2a74fe2ba7b5e1da96f658b069c Mon Sep 17 00:00:00 2001 From: Kaur Matas <33095685+kmatasfp@users.noreply.github.com> Date: Tue, 2 Jun 2026 13:38:36 -0700 Subject: [PATCH 01/60] feat: cloud-mode TestMode::Cloud for benchmarks with best-effort cleanup (#3596) --- golem-test-framework/src/benchmark/config.rs | 82 ++- golem-test-framework/src/benchmark/mod.rs | 1 + golem-test-framework/src/benchmark/results.rs | 9 + .../component_compilation_service/mod.rs | 1 + .../unavailable.rs | 35 ++ .../src/components/rdb/mod.rs | 1 + .../src/components/rdb/unavailable.rs | 31 + .../src/components/redis/mod.rs | 1 + .../src/components/redis/unavailable.rs | 43 ++ .../src/components/redis_monitor/mod.rs | 1 + .../components/redis_monitor/unavailable.rs | 29 + .../src/components/registry_service/cloud.rs | 167 ++++++ .../src/components/registry_service/mod.rs | 1 + .../src/components/shard_manager/mod.rs | 1 + .../components/shard_manager/unavailable.rs | 56 ++ .../components/worker_executor_cluster/mod.rs | 1 + .../worker_executor_cluster/unavailable.rs | 63 +++ .../src/components/worker_service/cloud.rs | 113 ++++ .../src/components/worker_service/mod.rs | 1 + golem-test-framework/src/config/benchmark.rs | 166 +++++- golem-test-framework/src/config/dsl_impl.rs | 5 +- golem-test-framework/src/config/mod.rs | 14 +- integration-tests/src/benchmarks/all.rs | 220 +++++++- integration-tests/src/benchmarks/cleanup.rs | 529 ++++++++++++++++++ .../src/benchmarks/cold_start_unknown.rs | 25 +- .../src/benchmarks/durability_overhead.rs | 6 +- integration-tests/src/benchmarks/latency.rs | 8 +- integration-tests/src/benchmarks/mod.rs | 10 +- integration-tests/src/benchmarks/sleep.rs | 8 +- .../src/benchmarks/throughput.rs | 204 ++++--- 30 files changed, 1741 insertions(+), 91 deletions(-) create mode 100644 golem-test-framework/src/components/component_compilation_service/unavailable.rs create mode 100644 golem-test-framework/src/components/rdb/unavailable.rs create mode 100644 
golem-test-framework/src/components/redis/unavailable.rs create mode 100644 golem-test-framework/src/components/redis_monitor/unavailable.rs create mode 100644 golem-test-framework/src/components/registry_service/cloud.rs create mode 100644 golem-test-framework/src/components/shard_manager/unavailable.rs create mode 100644 golem-test-framework/src/components/worker_executor_cluster/unavailable.rs create mode 100644 golem-test-framework/src/components/worker_service/cloud.rs create mode 100644 integration-tests/src/benchmarks/cleanup.rs diff --git a/golem-test-framework/src/benchmark/config.rs b/golem-test-framework/src/benchmark/config.rs index c011ac65b0..0d172baa24 100644 --- a/golem-test-framework/src/benchmark/config.rs +++ b/golem-test-framework/src/benchmark/config.rs @@ -116,7 +116,7 @@ pub struct BenchmarkSuiteItem { impl BenchmarkSuiteItem { pub fn runs(&self, mode: &TestMode) -> Vec { let cluster_size: Vec = match mode { - TestMode::Provided { .. } => { + TestMode::Provided { .. } | TestMode::Cloud { .. } => { vec![0] } _ => self @@ -163,3 +163,83 @@ impl BenchmarkSuiteItem { res } } + +/// Smoke tests for cloud-mode wiring that do not require running services. 
+/// +/// For a full end-to-end smoke test that exercises actual HTTP clients, +/// cleanup, and the benchmark API contract, run the binary directly against a +/// local Spawned cluster: +/// +/// ```text +/// cargo run --bin benchmarks -- benchmark cold-start-unknown-small \ +/// --size 1 --iterations 1 --length 0 \ +/// cloud \ +/// --api-url http://localhost:8081 \ +/// --apps-base-domain golem.cloud \ +/// --admin-account-id <ADMIN_ACCOUNT_ID> \ +/// --admin-account-email <ADMIN_ACCOUNT_EMAIL> \ +/// --admin-account-token <ADMIN_ACCOUNT_TOKEN> \ +/// --builtin-plugin-owner-account-id <BUILTIN_PLUGIN_OWNER_ACCOUNT_ID> \ +/// --default-plan-id <DEFAULT_PLAN_ID> +/// ``` +#[cfg(test)] +mod cloud_mode_smoke { + use super::*; + use test_r::test; + use url::Url; + use uuid::Uuid; + + fn cloud_mode() -> TestMode { + TestMode::Cloud { + api_url: Url::parse("https://release.dev-api.golem.cloud").unwrap(), + apps_base_domain: "apps.dev.golem.cloud".to_string(), + admin_account_token: "test-token".to_string(), + builtin_plugin_owner_account_id: Uuid::nil(), + default_plan_id: Uuid::nil(), + shard_manager_grpc_host: None, + shard_manager_grpc_port: None, + component_directory: "test-components".to_string(), + } + } + + /// Cloud mode always returns exactly one `RunConfig` with `cluster_size=0`, + /// regardless of how many `cluster_size` values the suite item specifies. + #[test] + fn runs_returns_single_cluster_size_zero_run() { + let mode = cloud_mode(); + let item = BenchmarkSuiteItem { + name: "cold-start-unknown-small".to_string(), + iterations: 3, + cluster_size: vec![1, 3, 5], // must be ignored in cloud mode + size: vec![10], + length: vec![100], + disable_compilation_cache: None, + }; + let runs = item.runs(&mode); + assert_eq!(runs.len(), 1, "cloud mode ignores cluster_size variations"); + assert_eq!(runs[0].cluster_size, 0, "cloud mode cluster_size must be 0"); + assert_eq!(runs[0].size, 10); + assert_eq!(runs[0].length, 100); + } + + /// Multiple size and length combinations still expand normally; only + /// `cluster_size` is collapsed. 
+ #[test] + fn runs_expands_size_and_length_but_not_cluster_size() { + let mode = cloud_mode(); + let item = BenchmarkSuiteItem { + name: "latency-small".to_string(), + iterations: 1, + cluster_size: vec![1, 3], + size: vec![5, 10], + length: vec![50, 100], + disable_compilation_cache: None, + }; + let runs = item.runs(&mode); + // 1 (collapsed cluster_size) × 2 sizes × 2 lengths = 4 runs + assert_eq!(runs.len(), 4); + for r in &runs { + assert_eq!(r.cluster_size, 0); + } + } +} diff --git a/golem-test-framework/src/benchmark/mod.rs b/golem-test-framework/src/benchmark/mod.rs index 1f349afddd..fd246b2be7 100644 --- a/golem-test-framework/src/benchmark/mod.rs +++ b/golem-test-framework/src/benchmark/mod.rs @@ -301,6 +301,7 @@ impl BenchmarkApi for B { description: B::description().to_string(), runs, results, + run_id: None, } } } diff --git a/golem-test-framework/src/benchmark/results.rs b/golem-test-framework/src/benchmark/results.rs index 1cb0f329b6..a309d1d5e5 100644 --- a/golem-test-framework/src/benchmark/results.rs +++ b/golem-test-framework/src/benchmark/results.rs @@ -495,6 +495,10 @@ pub struct BenchmarkSuiteResult { pub environment: String, pub version: String, pub timestamp: DateTime, + /// Suite-level run-id. Set in cloud mode to `bench-{run_id}` to allow + /// cross-run correlation and garbage collection of orphaned state. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub run_id: Option, pub results: Vec, } @@ -530,6 +534,7 @@ impl BenchmarkSuiteResult { environment, version: golem_common::golem_version().to_string(), timestamp: Utc::now(), + run_id: None, results: vec![], } } @@ -606,6 +611,10 @@ pub struct BenchmarkResult { pub description: String, pub runs: Vec, pub results: Vec, + /// Suite-level run-id. Set in cloud mode to `bench-{run_id}` to allow + /// cross-run correlation and garbage collection of orphaned state. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub run_id: Option, } impl BenchmarkResult { diff --git a/golem-test-framework/src/components/component_compilation_service/mod.rs b/golem-test-framework/src/components/component_compilation_service/mod.rs index f80d2f84d6..50da099698 100644 --- a/golem-test-framework/src/components/component_compilation_service/mod.rs +++ b/golem-test-framework/src/components/component_compilation_service/mod.rs @@ -21,6 +21,7 @@ use tracing::Level; pub mod provided; pub mod spawned; +pub mod unavailable; #[async_trait] pub trait ComponentCompilationService: Send + Sync { diff --git a/golem-test-framework/src/components/component_compilation_service/unavailable.rs b/golem-test-framework/src/components/component_compilation_service/unavailable.rs new file mode 100644 index 0000000000..fb355cd0b3 --- /dev/null +++ b/golem-test-framework/src/components/component_compilation_service/unavailable.rs @@ -0,0 +1,35 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::ComponentCompilationService; +use async_trait::async_trait; + +/// A `ComponentCompilationService` that is not directly reachable. Used in +/// cloud mode, where it is an internal cluster component with no external +/// exposure. `kill` is a no-op so that `kill_all()` completes; operational +/// methods panic with a clear message. 
+pub struct UnavailableComponentCompilationService; + +#[async_trait] +impl ComponentCompilationService for UnavailableComponentCompilationService { + fn grpc_host(&self) -> String { + panic!("component_compilation_service() is not available in cloud mode"); + } + + fn grpc_port(&self) -> u16 { + panic!("component_compilation_service() is not available in cloud mode"); + } + + async fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/rdb/mod.rs b/golem-test-framework/src/components/rdb/mod.rs index 5f1b5c7fb8..ce8863c10e 100644 --- a/golem-test-framework/src/components/rdb/mod.rs +++ b/golem-test-framework/src/components/rdb/mod.rs @@ -29,6 +29,7 @@ pub mod docker_mysql; pub mod docker_postgres; pub mod provided_postgres; pub mod sqlite; +pub mod unavailable; #[async_trait] pub trait Rdb: Send + Sync { diff --git a/golem-test-framework/src/components/rdb/unavailable.rs b/golem-test-framework/src/components/rdb/unavailable.rs new file mode 100644 index 0000000000..1df99efe70 --- /dev/null +++ b/golem-test-framework/src/components/rdb/unavailable.rs @@ -0,0 +1,31 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{DbInfo, Rdb}; +use async_trait::async_trait; + +/// An `Rdb` that is not directly reachable. Used in cloud mode, where the +/// database is an internal cluster component with no external exposure. 
+/// Lifecycle teardown (`kill`) is a no-op so that `kill_all()` completes; +/// operational methods panic with a clear message. +pub struct UnavailableRdb; + +#[async_trait] +impl Rdb for UnavailableRdb { + fn info(&self) -> DbInfo { + panic!("rdb() is not available in cloud mode"); + } + + async fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/redis/mod.rs b/golem-test-framework/src/components/redis/mod.rs index df14595c7a..62346ec293 100644 --- a/golem-test-framework/src/components/redis/mod.rs +++ b/golem-test-framework/src/components/redis/mod.rs @@ -20,6 +20,7 @@ use tracing::info; pub mod provided; pub mod spawned; pub mod spawned_tls; +pub mod unavailable; #[async_trait] pub trait Redis: Send + Sync { diff --git a/golem-test-framework/src/components/redis/unavailable.rs b/golem-test-framework/src/components/redis/unavailable.rs new file mode 100644 index 0000000000..0f24489fe9 --- /dev/null +++ b/golem-test-framework/src/components/redis/unavailable.rs @@ -0,0 +1,43 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::Redis; +use async_trait::async_trait; + +/// A `Redis` that is not directly reachable. Used in cloud mode, where Redis +/// is an internal cluster component with no external exposure. `kill` is a +/// no-op so that `kill_all()` completes; operational methods panic with a +/// clear message. 
+pub struct UnavailableRedis; + +#[async_trait] +impl Redis for UnavailableRedis { + fn assert_valid(&self) { + panic!("redis() is not available in cloud mode"); + } + + fn private_host(&self) -> String { + panic!("redis() is not available in cloud mode"); + } + + fn private_port(&self) -> u16 { + panic!("redis() is not available in cloud mode"); + } + + fn prefix(&self) -> &str { + panic!("redis() is not available in cloud mode"); + } + + async fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/redis_monitor/mod.rs b/golem-test-framework/src/components/redis_monitor/mod.rs index eb73fe0e0d..2a24665ec5 100644 --- a/golem-test-framework/src/components/redis_monitor/mod.rs +++ b/golem-test-framework/src/components/redis_monitor/mod.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod spawned; +pub mod unavailable; pub trait RedisMonitor: Send + Sync { fn assert_valid(&self); diff --git a/golem-test-framework/src/components/redis_monitor/unavailable.rs b/golem-test-framework/src/components/redis_monitor/unavailable.rs new file mode 100644 index 0000000000..bdde53d231 --- /dev/null +++ b/golem-test-framework/src/components/redis_monitor/unavailable.rs @@ -0,0 +1,29 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::RedisMonitor; + +/// A `RedisMonitor` that is not directly reachable. Used in cloud mode, where +/// Redis is an internal cluster component with no external exposure. 
`kill` is +/// a no-op so that `kill_all()` completes; operational methods panic with a +/// clear message. +pub struct UnavailableRedisMonitor; + +impl RedisMonitor for UnavailableRedisMonitor { + fn assert_valid(&self) { + panic!("redis_monitor() is not available in cloud mode"); + } + + fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/registry_service/cloud.rs b/golem-test-framework/src/components/registry_service/cloud.rs new file mode 100644 index 0000000000..79e5d03935 --- /dev/null +++ b/golem-test-framework/src/components/registry_service/cloud.rs @@ -0,0 +1,167 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::RegistryService; +use async_trait::async_trait; +use golem_client::api::RegistryServiceClientLive; +use golem_client::{Context, Security}; +use golem_common::model::account::{AccountEmail, AccountId}; +use golem_common::model::auth::TokenSecret; +use golem_common::model::plan::PlanId; +use std::time::Duration; +use tokio::sync::OnceCell; +use tracing::info; +use url::Url; + +/// Registry-service client for cloud mode. +/// +/// In the deployed Golem environment both registry-service and worker-service +/// are reachable behind a single Gateway API hostname +/// (e.g. `https://release.dev-api.golem.cloud`). This struct holds that shared +/// `api_url`; routing to the correct backend service is done by the Gateway +/// based on URL path. 
+pub struct CloudRegistryService { + api_url: Url, + admin_token: TokenSecret, + builtin_plugin_owner_account_id: AccountId, + default_plan_id: PlanId, + base_http_client: OnceCell, +} + +impl CloudRegistryService { + pub fn new( + api_url: Url, + admin_token: TokenSecret, + builtin_plugin_owner_account_id: AccountId, + default_plan_id: PlanId, + ) -> Self { + info!("Using cloud API gateway at {api_url}"); + Self { + api_url, + admin_token, + builtin_plugin_owner_account_id, + default_plan_id, + base_http_client: OnceCell::new(), + } + } +} + +/// Constructs the tuned HTTP client for cloud-mode benchmark connections. +/// +/// Settings: large connection pool (1024), 90-second idle timeout, TCP +/// nodelay, and 180-second request timeout. +/// +/// Note: `http2_prior_knowledge()` is deliberately **not** set. Prior +/// knowledge is for h2c (HTTP/2 over plain HTTP). All cloud endpoints are +/// HTTPS, where HTTP/2 is negotiated through ALPN during the TLS handshake +/// (TLS termination happens at Envoy). Setting prior knowledge would bypass +/// ALPN and can cause protocol errors. 
+pub fn new_cloud_reqwest_client() -> reqwest_middleware::ClientWithMiddleware { + let client = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(1024) + .pool_idle_timeout(Duration::from_secs(90)) + .tcp_nodelay(true) + .timeout(Duration::from_secs(180)) + .build() + .expect("Failed to build cloud HTTP client"); + reqwest_middleware::ClientBuilder::new(client) + .with(reqwest_tracing::TracingMiddleware::default()) + .build() +} + +#[async_trait] +impl RegistryService for CloudRegistryService { + fn http_host(&self) -> String { + self.api_url.host_str().unwrap_or("localhost").to_string() + } + + fn http_port(&self) -> u16 { + self.api_url.port_or_known_default().unwrap_or(443) + } + + fn grpc_host(&self) -> String { + panic!("grpc_host() is not available through the Gateway in cloud mode"); + } + + fn grpc_port(&self) -> u16 { + panic!("grpc_port() is not available through the Gateway in cloud mode"); + } + + fn admin_account_id(&self) -> AccountId { + AccountId(uuid::Uuid::nil()) + } + + fn admin_account_email(&self) -> AccountEmail { + AccountEmail::new(String::new()) + } + + fn admin_account_token(&self) -> TokenSecret { + self.admin_token.clone() + } + + fn builtin_plugin_owner_account_id(&self) -> AccountId { + self.builtin_plugin_owner_account_id + } + + fn default_plan(&self) -> PlanId { + self.default_plan_id + } + + fn low_fuel_plan(&self) -> PlanId { + panic!( + "low_fuel_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + fn low_disk_space_plan(&self) -> PlanId { + panic!( + "low_disk_space_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + fn low_http_calls_plan(&self) -> PlanId { + panic!( + "low_http_calls_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + fn low_rpc_calls_plan(&self) -> PlanId { + panic!( + 
"low_rpc_calls_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + async fn kill(&self) {} + + async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware { + self.base_http_client + .get_or_init(|| async { new_cloud_reqwest_client() }) + .await + .clone() + } + + async fn client(&self, token: &TokenSecret) -> RegistryServiceClientLive { + RegistryServiceClientLive { + context: Context { + client: self.base_http_client().await, + base_url: self.api_url.clone(), + security_token: Security::Bearer(token.secret().to_string()), + }, + } + } +} diff --git a/golem-test-framework/src/components/registry_service/mod.rs b/golem-test-framework/src/components/registry_service/mod.rs index d38f88577d..42b0b9ddfd 100644 --- a/golem-test-framework/src/components/registry_service/mod.rs +++ b/golem-test-framework/src/components/registry_service/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+pub mod cloud; pub mod provided; pub mod spawned; diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs index 5245865e4b..67b10051bf 100644 --- a/golem-test-framework/src/components/shard_manager/mod.rs +++ b/golem-test-framework/src/components/shard_manager/mod.rs @@ -14,6 +14,7 @@ pub mod provided; pub mod spawned; +pub mod unavailable; use super::rdb::Rdb; use super::registry_service::RegistryService; diff --git a/golem-test-framework/src/components/shard_manager/unavailable.rs b/golem-test-framework/src/components/shard_manager/unavailable.rs new file mode 100644 index 0000000000..834dfb8d2c --- /dev/null +++ b/golem-test-framework/src/components/shard_manager/unavailable.rs @@ -0,0 +1,56 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::ShardManager; +use async_trait::async_trait; +use golem_common::model::RoutingTable; + +/// A `ShardManager` that is not directly reachable. Used in cloud mode when no +/// shard-manager port-forward is configured; pass `--shard-manager-grpc-host` +/// and `--shard-manager-grpc-port` to use a real `ProvidedShardManager` +/// instead. +/// +/// `kill`/`restart` are no-ops. `get_routing_table()` returns an error so that +/// callers (e.g. the throughput benchmark) can fall back to the unlabeled +/// single-bucket mode. The host/port accessors panic with a clear message. 
+pub struct UnavailableShardManager; + +#[async_trait] +impl ShardManager for UnavailableShardManager { + fn grpc_host(&self) -> String { + panic!( + "shard_manager() requires --shard-manager-grpc-host and \ + --shard-manager-grpc-port to be configured in cloud mode" + ); + } + + fn grpc_port(&self) -> u16 { + panic!( + "shard_manager() requires --shard-manager-grpc-host and \ + --shard-manager-grpc-port to be configured in cloud mode" + ); + } + + async fn kill(&self) {} + + async fn restart(&self, _number_of_shards_override: Option) {} + + async fn get_routing_table(&self) -> crate::Result { + Err(anyhow::anyhow!( + "shard_manager is not configured in cloud mode; \ + pass --shard-manager-grpc-host and --shard-manager-grpc-port \ + to enable routing table fetch and local/remote RPC labeling" + )) + } +} diff --git a/golem-test-framework/src/components/worker_executor_cluster/mod.rs b/golem-test-framework/src/components/worker_executor_cluster/mod.rs index 2dc8e21745..e1db10b237 100644 --- a/golem-test-framework/src/components/worker_executor_cluster/mod.rs +++ b/golem-test-framework/src/components/worker_executor_cluster/mod.rs @@ -18,6 +18,7 @@ use std::sync::Arc; pub mod provided; pub mod spawned; +pub mod unavailable; #[async_trait] pub trait WorkerExecutorCluster: Send + Sync { diff --git a/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs new file mode 100644 index 0000000000..53a5cc87be --- /dev/null +++ b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs @@ -0,0 +1,63 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::components::worker_executor::WorkerExecutor; +use crate::components::worker_executor_cluster::WorkerExecutorCluster; +use async_trait::async_trait; +use std::sync::Arc; + +/// A `WorkerExecutorCluster` whose individual executors are not directly +/// reachable. Used in cloud mode, where executors are internal cluster +/// components with no external exposure. +/// +/// Lifecycle teardown methods (`kill_all`, `restart_all`) are no-ops so that +/// `kill_all()` completes. `is_running()` returns `true` so that +/// `ensure_all_deps_running()` is a no-op. Per-executor operations panic with a +/// clear message. 
+pub struct UnavailableWorkerExecutorCluster; + +#[async_trait] +impl WorkerExecutorCluster for UnavailableWorkerExecutorCluster { + fn size(&self) -> usize { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn kill_all(&self) {} + + async fn restart_all(&self) {} + + async fn stop(&self, _index: usize) { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn start(&self, _index: usize) { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + fn to_vec(&self) -> Vec> { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn stopped_indices(&self) -> Vec { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn started_indices(&self) -> Vec { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn is_running(&self) -> bool { + true + } +} diff --git a/golem-test-framework/src/components/worker_service/cloud.rs b/golem-test-framework/src/components/worker_service/cloud.rs new file mode 100644 index 0000000000..ceb60f4fbe --- /dev/null +++ b/golem-test-framework/src/components/worker_service/cloud.rs @@ -0,0 +1,113 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::components::registry_service::cloud::new_cloud_reqwest_client; +use crate::components::worker_service::WorkerService; +use async_trait::async_trait; +use golem_client::api::{AgentClientLive, WorkerClientLive}; +use golem_client::{Context, Security}; +use golem_common::model::auth::TokenSecret; +use tokio::sync::OnceCell; +use tracing::info; +use url::Url; + +/// Worker-service client for cloud mode. +/// +/// In the deployed Golem environment both registry-service and worker-service +/// are reachable behind a single Gateway API hostname +/// (e.g. `https://release.dev-api.golem.cloud`). This struct holds that shared +/// `api_url`; routing to worker-service is done by the Gateway based on URL +/// path (`/v1/components/*/workers/**`, `/v1/agents/**`). +pub struct CloudWorkerService { + api_url: Url, + base_http_client: OnceCell, +} + +impl CloudWorkerService { + pub fn new(api_url: Url) -> Self { + info!("Using cloud worker-service via API gateway at {api_url}"); + Self { + api_url, + base_http_client: OnceCell::new(), + } + } +} + +#[async_trait] +impl WorkerService for CloudWorkerService { + fn http_host(&self) -> String { + self.api_url.host_str().unwrap_or("localhost").to_string() + } + + fn http_port(&self) -> u16 { + self.api_url.port_or_known_default().unwrap_or(443) + } + + fn grpc_host(&self) -> String { + panic!("grpc_host() is not available through the Gateway in cloud mode"); + } + + fn gprc_port(&self) -> u16 { + panic!("gprc_port() is not available through the Gateway in cloud mode"); + } + + fn custom_request_host(&self) -> String { + // Code-first HTTP API deployments are reached via the apps base domain + // (*.apps.dev.golem.cloud), not through this host. + panic!("custom_request_host() is not available in cloud mode"); + } + + fn custom_request_port(&self) -> u16 { + // Code-first HTTP API deployments are reached via the apps base domain + // (*.apps.dev.golem.cloud), not through this port. 
+ panic!("custom_request_port() is not available in cloud mode"); + } + + fn mcp_port(&self) -> u16 { + panic!("mcp_port() is not available in cloud mode"); + } + + async fn kill(&self) {} + + async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware { + self.base_http_client + .get_or_init(|| async { new_cloud_reqwest_client() }) + .await + .clone() + } + + /// Overrides the trait default to use the configured API gateway URL + /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`. + async fn worker_http_client(&self, token: &TokenSecret) -> WorkerClientLive { + WorkerClientLive { + context: Context { + client: self.base_http_client().await, + base_url: self.api_url.clone(), + security_token: Security::Bearer(token.secret().to_string()), + }, + } + } + + /// Overrides the trait default to use the configured API gateway URL + /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`. + async fn agent_http_client(&self, token: &TokenSecret) -> AgentClientLive { + AgentClientLive { + context: Context { + client: self.base_http_client().await, + base_url: self.api_url.clone(), + security_token: Security::Bearer(token.secret().to_string()), + }, + } + } +} diff --git a/golem-test-framework/src/components/worker_service/mod.rs b/golem-test-framework/src/components/worker_service/mod.rs index 6885e86696..126cc988c9 100644 --- a/golem-test-framework/src/components/worker_service/mod.rs +++ b/golem-test-framework/src/components/worker_service/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+pub mod cloud; pub mod provided; pub mod spawned; diff --git a/golem-test-framework/src/config/benchmark.rs b/golem-test-framework/src/config/benchmark.rs index a1a304436b..e97a58c47d 100644 --- a/golem-test-framework/src/config/benchmark.rs +++ b/golem-test-framework/src/config/benchmark.rs @@ -16,15 +16,21 @@ use crate::benchmark::BenchmarkConfig; use crate::components::component_compilation_service::ComponentCompilationService; use crate::components::component_compilation_service::provided::ProvidedComponentCompilationService; use crate::components::component_compilation_service::spawned::SpawnedComponentCompilationService; +use crate::components::component_compilation_service::unavailable::UnavailableComponentCompilationService; +use crate::components::rdb::PostgresInfo; +use crate::components::rdb::Rdb; use crate::components::rdb::docker_postgres::DockerPostgresRdb; use crate::components::rdb::provided_postgres::ProvidedPostgresRdb; -use crate::components::rdb::{PostgresInfo, Rdb}; +use crate::components::rdb::unavailable::UnavailableRdb; use crate::components::redis::Redis; use crate::components::redis::provided::ProvidedRedis; use crate::components::redis::spawned::SpawnedRedis; +use crate::components::redis::unavailable::UnavailableRedis; use crate::components::redis_monitor::RedisMonitor; use crate::components::redis_monitor::spawned::SpawnedRedisMonitor; +use crate::components::redis_monitor::unavailable::UnavailableRedisMonitor; use crate::components::registry_service::RegistryService; +use crate::components::registry_service::cloud::CloudRegistryService; use crate::components::registry_service::provided::ProvidedRegistryService; use crate::components::registry_service::spawned::SpawnedRegistryService; use crate::components::service::Service; @@ -32,10 +38,13 @@ use crate::components::service::spawned::SpawnedService; use crate::components::shard_manager::ShardManager; use crate::components::shard_manager::provided::ProvidedShardManager; use 
crate::components::shard_manager::spawned::SpawnedShardManager; +use crate::components::shard_manager::unavailable::UnavailableShardManager; use crate::components::worker_executor_cluster::WorkerExecutorCluster; use crate::components::worker_executor_cluster::provided::ProvidedWorkerExecutorCluster; use crate::components::worker_executor_cluster::spawned::SpawnedWorkerExecutorCluster; +use crate::components::worker_executor_cluster::unavailable::UnavailableWorkerExecutorCluster; use crate::components::worker_service::WorkerService; +use crate::components::worker_service::cloud::CloudWorkerService; use crate::components::worker_service::provided::ProvidedWorkerService; use crate::components::worker_service::spawned::SpawnedWorkerService; use crate::config::TestDependencies; @@ -51,11 +60,24 @@ use golem_service_base::storage::blob::BlobStorage; use golem_service_base::storage::blob::fs::FileSystemBlobStorage; use std::collections::HashMap; use std::path::{Path, PathBuf}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use tempfile::TempDir; use tracing::Level; +use url::Url; use uuid::Uuid; +/// Process-level UUID generated on the first cloud-mode benchmark context +/// creation. All cloud contexts within the same binary invocation share this +/// run-id, which is used to prefix account/app/env names +/// (`bench-{run_id}-…`) and written into result JSON metadata. +static CLOUD_BENCH_RUN_ID: OnceLock = OnceLock::new(); + +/// Returns the suite-level run-id if any cloud benchmark context has been +/// created in this process, `None` otherwise. +pub fn cloud_bench_run_id() -> Option { + CLOUD_BENCH_RUN_ID.get().copied() +} + /// Test dependencies created from command line arguments /// /// To be used when a single executable with an async entry point requires @@ -75,6 +97,12 @@ pub struct BenchmarkTestDependencies { component_directory: PathBuf, component_temp_directory: Arc, registry_service: Arc, + /// Set to `Some` in cloud mode. 
Used to prefix account/app/env names with + /// `bench-{run_id}-` so that orphaned state is traceable. + run_id: Option, + /// The apps base domain for cloud mode (e.g. `apps.golem.cloud`). Used to + /// construct HTTP API deployment domains as `{env_id}.{apps_base_domain}`. + apps_base_domain: Option, } #[derive(Parser, Debug, Clone)] @@ -222,6 +250,54 @@ pub enum TestMode { #[arg(long, default_value = "test-components")] component_directory: String, }, + /// Cloud mode: run benchmarks against a deployed Golem environment via + /// Gateway-API hostnames. No local service processes are spawned. + /// + /// All management API calls (registry-service, worker-service, agents) go + /// through a single Gateway hostname (`--api-url`). HTTP API deployment + /// access (code-first HTTP APIs) goes through `{env_id}.{apps_base_domain}`. + /// + /// For `golem-dev`: + /// `--api-url https://release.dev-api.golem.cloud` + /// `--apps-base-domain apps.dev.golem.cloud` + #[command()] + Cloud { + /// Base URL of the deployed Golem API Gateway. Both registry-service + /// and worker-service paths are routed internally by the Gateway. + /// + /// For the `golem-dev` environment this is + /// `https://release.dev-api.golem.cloud`. + #[arg(long)] + api_url: Url, + /// Wildcard base domain used to build per-environment HTTP API + /// deployment hostnames: `{env_id}.{apps_base_domain}`. + /// + /// For the `golem-dev` environment this is `apps.dev.golem.cloud`. + #[arg(long)] + apps_base_domain: String, + /// Bearer token for the admin account. Used to create a fresh user + /// account for each benchmark run, which then owns all benchmark state. + #[arg(long)] + admin_account_token: String, + /// UUID of the builtin-plugin-owner account. + #[arg(long)] + builtin_plugin_owner_account_id: Uuid, + /// UUID of the default plan on the target cluster. + #[arg(long)] + default_plan_id: Uuid, + /// Optional shard-manager gRPC hostname for a kubectl port-forward + /// (e.g. `localhost`). 
When set together with + /// `--shard-manager-grpc-port`, the throughput benchmark fetches the + /// routing table and labels RPC pairs as local/remote. + #[arg(long)] + shard_manager_grpc_host: Option, + /// Optional shard-manager gRPC port (e.g. `9090`). + #[arg(long)] + shard_manager_grpc_port: Option, + /// Directory containing test WASM component files. + #[arg(long, default_value = "test-components")] + component_directory: String, + }, } impl BenchmarkTestDependencies { @@ -419,6 +495,8 @@ impl BenchmarkTestDependencies { initial_agent_files_service, component_temp_directory: Arc::new(TempDir::new().unwrap()), registry_service, + run_id: None, + apps_base_domain: None, } } @@ -542,6 +620,8 @@ impl BenchmarkTestDependencies { initial_agent_files_service, component_temp_directory: Arc::new(TempDir::new().unwrap()), registry_service, + run_id: None, + apps_base_domain: None, } } TestMode::Spawned { @@ -590,17 +670,93 @@ impl BenchmarkTestDependencies { ) .await } + TestMode::Cloud { + api_url, + apps_base_domain, + admin_account_token, + builtin_plugin_owner_account_id, + default_plan_id, + shard_manager_grpc_host, + shard_manager_grpc_port, + component_directory, + } => { + let blob_storage = Arc::new( + FileSystemBlobStorage::new( + &std::env::temp_dir().join("golem-bench-blob-storage"), + ) + .await + .unwrap(), + ); + let initial_agent_files_service = + Arc::new(InitialAgentFilesService::new(blob_storage.clone())); + + // Use the process-level run_id (shared across all cloud contexts in + // this process so all benchmarks in a suite carry the same run ID). + let run_id = *CLOUD_BENCH_RUN_ID.get_or_init(Uuid::new_v4); + tracing::info!("Cloud benchmark run_id: {run_id}"); + + // Both registry-service and worker-service are reachable via the + // same Gateway hostname; routing is path-based. 
+ let registry_service: Arc = + Arc::new(CloudRegistryService::new( + api_url.clone(), + TokenSecret::trusted(admin_account_token.clone()), + AccountId(*builtin_plugin_owner_account_id), + PlanId(*default_plan_id), + )); + + let shard_manager: Arc = + match (shard_manager_grpc_host, shard_manager_grpc_port) { + (Some(host), Some(port)) => { + Arc::new(ProvidedShardManager::new(host.clone(), 0, *port)) + } + _ => Arc::new(UnavailableShardManager), + }; + + let worker_service: Arc = + Arc::new(CloudWorkerService::new(api_url.clone())); + + Self { + rdb: Arc::new(UnavailableRdb), + redis: Arc::new(UnavailableRedis), + redis_monitor: Arc::new(UnavailableRedisMonitor), + shard_manager, + component_compilation_service: Arc::new(UnavailableComponentCompilationService), + worker_service, + worker_executor_cluster: Arc::new(UnavailableWorkerExecutorCluster), + component_directory: Path::new(component_directory).to_path_buf(), + blob_storage, + initial_agent_files_service, + component_temp_directory: Arc::new(TempDir::new().unwrap()), + registry_service, + run_id: Some(run_id), + apps_base_domain: Some(apps_base_domain.clone()), + } + } } } - /// Checks if all the spawned dependencies are still running, and if not, panicks + /// Checks if all the spawned dependencies are still running, and if not, panics. /// /// This can be used as a checkpoint in benchmarks to avoid infinite retries. + /// In cloud mode this is a no-op — the cloud cluster is assumed to be + /// managed externally. pub async fn ensure_all_deps_running(&self) { if !self.worker_executor_cluster.is_running().await { panic!("Worker executor process(es) stopped"); } } + + /// Returns the run-id for this benchmark context, if running in cloud mode. + /// Used to prefix accounts/apps/envs with `bench-{run_id}-`. + pub fn run_id(&self) -> Option { + self.run_id + } + + /// Returns the apps base domain for cloud mode (e.g. `apps.golem.cloud`). 
+ pub fn apps_base_domain(&self) -> Option<&str> { + self.apps_base_domain.as_deref() + } } #[async_trait] @@ -652,6 +808,10 @@ impl TestDependencies for BenchmarkTestDependencies { fn registry_service(&self) -> Arc { self.registry_service.clone() } + + fn bench_name_prefix(&self) -> Option { + self.run_id.map(|id| format!("bench-{id}-")) + } } #[allow(dead_code)] diff --git a/golem-test-framework/src/config/dsl_impl.rs b/golem-test-framework/src/config/dsl_impl.rs index b228a5235e..f2d5472175 100644 --- a/golem-test-framework/src/config/dsl_impl.rs +++ b/golem-test-framework/src/config/dsl_impl.rs @@ -883,8 +883,9 @@ impl TestDslExtended for TestUserContext { environment_options: &EnvironmentOptions, ) -> anyhow::Result<(Application, Environment)> { let client = self.registry_service_client().await; - let app_name = ApplicationName(format!("app-{}", Uuid::new_v4())); - let env_name = EnvironmentName(format!("env-{}", Uuid::new_v4())); + let prefix = self.deps.bench_name_prefix().unwrap_or_default(); + let app_name = ApplicationName(format!("{prefix}app-{}", Uuid::new_v4())); + let env_name = EnvironmentName(format!("{prefix}env-{}", Uuid::new_v4())); let application = client .create_application( diff --git a/golem-test-framework/src/config/mod.rs b/golem-test-framework/src/config/mod.rs index f5c14ace60..d8bdbe6b39 100644 --- a/golem-test-framework/src/config/mod.rs +++ b/golem-test-framework/src/config/mod.rs @@ -56,6 +56,13 @@ pub trait TestDependencies: Send + Sync + Clone { fn initial_agent_files_service(&self) -> Arc; fn registry_service(&self) -> Arc; + /// Returns an optional name prefix applied to benchmark-created accounts, + /// applications, and environments. Non-`None` in cloud mode, where the + /// prefix is `bench-{run_id}-` to make orphaned state traceable. 
+ fn bench_name_prefix(&self) -> Option { + None + } + async fn admin(&self) -> TestUserContext where Self: Sized, @@ -82,7 +89,12 @@ pub trait TestDependencies: Send + Sync + Clone { .client(®istry_service.admin_account_token()) .await; - let name = Uuid::new_v4().to_string(); + let uuid = Uuid::new_v4().to_string(); + let name = if let Some(prefix) = self.bench_name_prefix() { + format!("{prefix}{uuid}") + } else { + uuid + }; let account_data = AccountCreation { email: AccountEmail::new(format!("{name}@golem.cloud")), name, diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs index 91d972534d..6865ecf6e4 100644 --- a/integration-tests/src/benchmarks/all.rs +++ b/integration-tests/src/benchmarks/all.rs @@ -13,16 +13,28 @@ // limitations under the License. use clap::Parser; +use golem_client::api::RegistryServiceClient; +use golem_common::base_model::agent::ParsedAgentId; +use golem_common::model::AgentId; +use golem_common::model::application::{ApplicationCreation, ApplicationName}; +use golem_common::model::environment::{EnvironmentCreation, EnvironmentName}; +use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{ Benchmark, BenchmarkApi, BenchmarkConfig, BenchmarkResult, BenchmarkSuite, BenchmarkSuiteItem, BenchmarkSuiteResult, }; -use golem_test_framework::config::benchmark::TestMode; -use golem_test_framework::config::{BenchmarkCliParameters, BenchmarkTestDependencies}; +use golem_test_framework::config::benchmark::{TestMode, cloud_bench_run_id}; +use golem_test_framework::config::{ + BenchmarkCliParameters, BenchmarkTestDependencies, TestDependencies, +}; +use golem_test_framework::dsl::{TestDsl, TestDslExtended}; +use integration_tests::benchmarks::{ + cleanup_account, cleanup_user_state, delete_workers, invoke_and_await_agent, +}; use std::collections::BTreeMap; use std::future::Future; use std::pin::Pin; -use tracing::{Level, debug, info}; +use tracing::{Level, debug, info, warn}; type 
RunFn = Box< dyn for<'a> Fn( @@ -144,7 +156,14 @@ async fn main() { length: length.clone(), disable_compilation_cache: Some(*disable_compilation_cache), }; - let result = f( + + cloud_preflight_warmup( + params.benchmark_config.mode(), + params.service_verbosity(), + params.otlp, + ) + .await; + let mut result = f( params.benchmark_config.mode(), params.service_verbosity(), &item, @@ -152,6 +171,10 @@ async fn main() { params.otlp, ) .await; + // Attach the run_id to result metadata (cloud mode only). + if let Some(run_id) = cloud_bench_run_id() { + result.run_id = Some(format!("bench-{run_id}")); + } if params.json { let str = serde_json::to_string(&result) .expect("Failed to serialize BenchmarkResult"); @@ -174,9 +197,27 @@ async fn main() { let suite: BenchmarkSuite = serde_yaml::from_str(&raw_suite).expect("Failed to parse benchmark suite"); + // Validate every benchmark name up-front so a typo exits immediately + // without running warmup or any prior benchmark. + for benchmark in &suite.benchmarks { + if !benchmarks_by_name.contains_key(benchmark.name.as_str()) { + print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name); + // print_non_existing_benchmark calls std::process::exit(1) + unreachable!(); + } + } + + // Pre-flight warmup runs after all names are validated. + cloud_preflight_warmup( + params.benchmark_config.mode(), + params.service_verbosity(), + params.otlp, + ) + .await; + let mut suite_result = BenchmarkSuiteResult::new(&suite.name); for benchmark in suite.benchmarks { - info!("Running {benchmark:?}"); // TODO + info!("Running {benchmark:?}"); if let Some(f) = benchmarks_by_name.get(benchmark.name.as_str()) { let result = f( @@ -188,9 +229,13 @@ async fn main() { ) .await; suite_result.add(result); - } else { - print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name); } + // no else: we already validated all names above + } + + // Attach the run_id to result metadata (cloud mode only). 
+ if let Some(run_id) = cloud_bench_run_id() { + suite_result.run_id = Some(format!("bench-{run_id}")); } if let Some(path) = save_to_json { @@ -241,3 +286,164 @@ async fn run_benchmark( ) -> BenchmarkResult { B::run_benchmark(mode, verbosity, item, primary_only, otlp).await } + +// ── Pre-flight warmup constants ─────────────────────────────────────────────── + +/// WASM file name (without `.wasm`) of the component used for warmup +/// invocations. Must be present in `--component-directory`. +const WARMUP_COMPONENT_WASM: &str = "benchmark_agent_rust_release"; +/// Registry display name for the warmup component. +const WARMUP_COMPONENT_NAME: &str = "benchmark:agent-rust"; +/// Agent type whose `echo` method is invoked during warmup. +const WARMUP_AGENT_TYPE: &str = "RustBenchmarkAgent"; +/// Instance ID of the throwaway warmup agent. +const WARMUP_AGENT_INSTANCE: &str = "warmup"; +/// Total wall-clock budget for the 50 warmup invocations. If the budget +/// fires (e.g. the platform is slow to cold-start on the first invocation) +/// a warning is logged and the benchmark continues — warmup is best-effort. +const WARMUP_BUDGET: std::time::Duration = std::time::Duration::from_secs(180); + +/// Pre-flight warmup for cloud mode. Runs once at suite/benchmark start; +/// is a no-op for all non-cloud modes. +/// +/// Executes 50 throwaway `invoke_and_await_agent` calls against a short-lived +/// user/env/component. Each call exercises the full stack: +/// gateway → registry-service (component lookup) → worker-service +/// → worker-executor, warming NLB target-group routing and HTTP/2 sessions at +/// every hop so they don't contaminate the first measured iteration. +/// +/// The entire invocation phase is bounded by a 3-minute timeout. If the +/// timeout fires (e.g. because of a gateway routing issue on the first cold +/// start), a warning is logged and the benchmark continues — warm-up is +/// best-effort. +/// +/// If uploading the warmup component fails (e.g. 
the file is absent from the +/// component directory), a warning is logged and the agent-invocation phase +/// is skipped; the throwaway account is still cleaned up. +async fn cloud_preflight_warmup(mode: &TestMode, verbosity: Level, otlp: bool) { + if !matches!(mode, TestMode::Cloud { .. }) { + return; + } + + info!("Pre-flight warmup: creating throwaway user/env/component (50 invocations)..."); + + let deps = BenchmarkTestDependencies::new(mode, verbosity, 0, false, otlp).await; + + let user = match deps.user().await { + Ok(u) => u, + Err(e) => { + warn!("Pre-flight warmup: failed to create user (skipping): {e:?}"); + deps.kill_all().await; + return; + } + }; + + let registry_client = user.registry_service_client().await; + let prefix = user.deps.bench_name_prefix().unwrap_or_default(); + + let app = match registry_client + .create_application( + &user.account_id.0, + &ApplicationCreation { + name: ApplicationName(format!("{prefix}app-warmup")), + }, + ) + .await + { + Ok(a) => a, + Err(e) => { + warn!("Pre-flight warmup: failed to create app (skipping): {e:?}"); + cleanup_account(&user).await; + deps.kill_all().await; + return; + } + }; + + let env = match registry_client + .create_environment( + &app.id.0, + &EnvironmentCreation { + name: EnvironmentName(format!("{prefix}env-warmup")), + compatibility_check: false, + version_check: false, + security_overrides: false, + }, + ) + .await + { + Ok(e) => e, + Err(e) => { + warn!("Pre-flight warmup: failed to create env (skipping): {e:?}"); + // delete app explicitly before account (cascading delete is incomplete) + if let Err(del_err) = registry_client + .delete_application(&app.id.0, app.revision.into()) + .await + { + warn!( + "Pre-flight warmup: failed to delete app {} after env-creation \ + failure (best-effort, app may be orphaned): {del_err:?}", + app.id.0 + ); + } + cleanup_account(&user).await; + deps.kill_all().await; + return; + } + }; + + let component = match user + .component(&env.id, 
WARMUP_COMPONENT_WASM) + .name(WARMUP_COMPONENT_NAME) + .store() + .await + { + Ok(c) => c, + Err(e) => { + warn!( + "Pre-flight warmup: failed to upload warmup component \ + ({WARMUP_COMPONENT_WASM}.wasm) — ensure it exists in the \ + component directory: {e:?}" + ); + cleanup_user_state(&user, &env.id).await; + deps.kill_all().await; + return; + } + }; + + let warmup_agent: ParsedAgentId = agent_id!(WARMUP_AGENT_TYPE, WARMUP_AGENT_INSTANCE); + + // Bound the 50 invocations with a total wall-clock budget. + let invoke_result = tokio::time::timeout(WARMUP_BUDGET, async { + for i in 0..50usize { + let result = invoke_and_await_agent( + &user, + &component, + &warmup_agent, + "echo", + data_value!("warmup"), + ) + .await; + info!( + "Pre-flight warmup invocation {}/50: {}ms", + i + 1, + result.accumulated_time.as_millis() + ); + } + }) + .await; + + if invoke_result.is_err() { + warn!( + "Pre-flight warmup: invocation phase timed out after {}s (continuing anyway)", + WARMUP_BUDGET.as_secs() + ); + } + + if let Ok(worker_id) = AgentId::from_agent_id(component.id, &warmup_agent) { + delete_workers(&user, &[worker_id]).await; + } + cleanup_user_state(&user, &env.id).await; + deps.kill_all().await; + + info!("Cloud pre-flight warmup complete."); +} diff --git a/integration-tests/src/benchmarks/cleanup.rs b/integration-tests/src/benchmarks/cleanup.rs new file mode 100644 index 0000000000..2047b06c4d --- /dev/null +++ b/integration-tests/src/benchmarks/cleanup.rs @@ -0,0 +1,529 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! Cleanup helpers for cloud-perf benchmarks. +//! +//! The [`CleanupClient`] trait is the narrow interface used by the cascading +//! cleanup logic, which enables unit-testing with the [`MockCleanupClient`] +//! below. + +use async_trait::async_trait; +use golem_client::api::RegistryServiceClient; +use golem_common::model::environment::EnvironmentId; +use golem_test_framework::config::dsl_impl::TestUserContext; +use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies}; +use tracing::warn; +use uuid::Uuid; + +// ── Narrow trait ───────────────────────────────────────────────────────────── + +/// Narrow client interface covering only the operations used by the cascading +/// cleanup helpers. Use [`RegistryCleanupAdapter`] to wrap a real client and +/// [`MockCleanupClient`] (in tests) to inject failures. +#[async_trait] +pub trait CleanupClient: Send + Sync { + /// Returns `(component_id, revision)` pairs for all components in the env. + async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result>; + async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()>; + + /// Returns domain-registration IDs for the env. + async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result>; + async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()>; + + /// Returns `(application_id, env_revision)` for the environment. + async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)>; + async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()>; + + /// Returns the application's current revision. + async fn get_application_revision(&self, app_id: &Uuid) -> anyhow::Result; + async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()>; + + /// Returns the account's current revision. 
+ async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result; + async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()>; +} + +// ── Real adapter ───────────────────────────────────────────────────────────── + +/// Wraps any `RegistryServiceClient` implementor and bridges it to +/// [`CleanupClient`]. +pub struct RegistryCleanupAdapter { + inner: C, +} + +impl RegistryCleanupAdapter { + pub fn new(inner: C) -> Self { + Self { inner } + } +} + +#[async_trait] +impl CleanupClient for RegistryCleanupAdapter { + async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result> { + let page = self + .inner + .list_environment_components(env_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(page + .values + .into_iter() + .map(|c| (c.id.0, c.revision.into())) + .collect()) + } + + async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_component(id, revision) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result> { + let page = self + .inner + .list_environment_domain_registrations(env_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(page.values.into_iter().map(|dr| dr.id.0).collect()) + } + + async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()> { + self.inner + .delete_domain_registration(id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)> { + let env = self + .inner + .get_environment(env_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok((env.application_id.0, env.revision.into())) + } + + async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_environment(env_id, revision) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn get_application_revision(&self, app_id: 
&Uuid) -> anyhow::Result { + let app = self + .inner + .get_application(app_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(app.revision.into()) + } + + async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_application(app_id, revision) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result { + let account = self + .inner + .get_account(account_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(account.revision.into()) + } + + async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_account(account_id, revision) + .await + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("{e:?}")) + } +} + +// ── Core cleanup logic (testable via CleanupClient) ─────────────────────────── + +/// Steps 1–4 of the cascading cleanup: components → domain registrations → +/// environment → application. Does **not** delete the account. +/// +/// Every step is best-effort: failures are warned and cleanup continues. +/// +/// **Note:** Server-side cascading delete is incomplete (golemcloud/golem#3291). 
+pub async fn cleanup_env_and_app_with(client: &dyn CleanupClient, env_id: &Uuid) { + // Step 1: components + match client.list_env_components(env_id).await { + Ok(components) => { + for (cid, rev) in components { + if let Err(e) = client.delete_component(&cid, rev).await { + warn!("cleanup: delete component {cid} failed (best-effort): {e:?}"); + } + } + } + Err(e) => warn!("cleanup: list components for env {env_id} failed (best-effort): {e:?}"), + } + + // Step 2: domain registrations + match client.list_env_domain_registrations(env_id).await { + Ok(ids) => { + for id in ids { + if let Err(e) = client.delete_domain_registration(&id).await { + warn!("cleanup: delete domain registration {id} failed (best-effort): {e:?}"); + } + } + } + Err(e) => { + warn!( + "cleanup: list domain registrations for env {env_id} failed \ + (best-effort): {e:?}" + ) + } + } + + // Step 3: environment (also captures app_id for step 4) + let app_id = match client.get_env_app_id_and_revision(env_id).await { + Ok((app_id, rev)) => { + if let Err(e) = client.delete_environment(env_id, rev).await { + warn!("cleanup: delete environment {env_id} failed (best-effort): {e:?}"); + } + Some(app_id) + } + Err(e) => { + warn!("cleanup: get environment {env_id} failed (best-effort): {e:?}"); + None + } + }; + + // Step 4: application (only when app_id is known from step 3) + if let Some(app_id) = app_id { + match client.get_application_revision(&app_id).await { + Ok(rev) => { + if let Err(e) = client.delete_application(&app_id, rev).await { + warn!("cleanup: delete application {app_id} failed (best-effort): {e:?}"); + } + } + Err(e) => { + warn!("cleanup: get application {app_id} failed (best-effort): {e:?}") + } + } + } +} + +/// Step 5 of the cascading cleanup: deletes the user account. 
+pub async fn cleanup_account_with(client: &dyn CleanupClient, account_id: &Uuid) { + match client.get_account_revision(account_id).await { + Ok(rev) => { + if let Err(e) = client.delete_account(account_id, rev).await { + warn!("cleanup: delete account {account_id} failed (best-effort): {e:?}"); + } + } + Err(e) => { + warn!("cleanup: get account {account_id} failed (best-effort): {e:?}") + } + } +} + +// ── High-level wrappers (take a TestUserContext) ────────────────────────────── + +/// Steps 1–4: components, domain registrations, environment, application. +/// +/// For benchmarks whose iterations create one user with multiple envs/apps +/// (e.g. cold-start-unknown), call this once per env then call +/// [`cleanup_account`] once at the end. +pub async fn cleanup_env_and_app( + user: &TestUserContext, + env_id: &EnvironmentId, +) { + let client = user.deps.registry_service().client(&user.token).await; + let adapter = RegistryCleanupAdapter::new(client); + cleanup_env_and_app_with(&adapter, &env_id.0).await; +} + +/// Step 5: deletes the user account. +pub async fn cleanup_account(user: &TestUserContext) { + let client = user.deps.registry_service().client(&user.token).await; + let adapter = RegistryCleanupAdapter::new(client); + cleanup_account_with(&adapter, &user.account_id.0).await; +} + +/// Convenience wrapper for the common single-env-per-user case: +/// [`cleanup_env_and_app`] followed by [`cleanup_account`]. 
+pub async fn cleanup_user_state( + user: &TestUserContext, + env_id: &EnvironmentId, +) { + cleanup_env_and_app(user, env_id).await; + cleanup_account(user).await; +} + +// ── Unit tests ──────────────────────────────────────────────────────────────── + +#[cfg(test)] +pub mod tests { + use super::*; + use std::collections::HashSet; + use std::sync::{Arc, Mutex}; + use test_r::test; + + fn block_on(f: F) -> F::Output { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(f) + } + + /// In-process mock that records every operation attempted and fails the + /// operations listed in `fail_ops`. + pub struct MockCleanupClient { + fail_ops: HashSet<&'static str>, + /// Ordered log of every operation attempted. + pub calls: Arc>>, + /// The `application_id` returned by `get_env_app_id_and_revision` + /// (used to verify step-4 precondition propagation in tests). + pub app_id: Uuid, + } + + impl MockCleanupClient { + pub fn new(fail_ops: &[&'static str]) -> (Self, Arc>>) { + let calls = Arc::new(Mutex::new(Vec::new())); + let mock = Self { + fail_ops: fail_ops.iter().copied().collect(), + calls: calls.clone(), + app_id: Uuid::new_v4(), + }; + (mock, calls) + } + + fn record(&self, name: &'static str) { + self.calls.lock().unwrap().push(name); + } + + fn result(&self, name: &'static str) -> anyhow::Result<()> { + self.record(name); + if self.fail_ops.contains(name) { + Err(anyhow::anyhow!("simulated failure in {name}")) + } else { + Ok(()) + } + } + } + + #[async_trait] + impl CleanupClient for MockCleanupClient { + async fn list_env_components(&self, _: &Uuid) -> anyhow::Result> { + self.record("list_env_components"); + if self.fail_ops.contains("list_env_components") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(vec![(Uuid::new_v4(), 0)]) + } + } + + async fn delete_component(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_component") + } + + async fn list_env_domain_registrations(&self, 
_: &Uuid) -> anyhow::Result> { + self.record("list_env_domain_registrations"); + if self.fail_ops.contains("list_env_domain_registrations") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(vec![Uuid::new_v4()]) + } + } + + async fn delete_domain_registration(&self, _: &Uuid) -> anyhow::Result<()> { + self.result("delete_domain_registration") + } + + async fn get_env_app_id_and_revision(&self, _: &Uuid) -> anyhow::Result<(Uuid, u64)> { + self.record("get_env_app_id_and_revision"); + if self.fail_ops.contains("get_env_app_id_and_revision") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok((self.app_id, 1)) + } + } + + async fn delete_environment(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_environment") + } + + async fn get_application_revision(&self, _: &Uuid) -> anyhow::Result { + self.record("get_application_revision"); + if self.fail_ops.contains("get_application_revision") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(1) + } + } + + async fn delete_application(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_application") + } + + async fn get_account_revision(&self, _: &Uuid) -> anyhow::Result { + self.record("get_account_revision"); + if self.fail_ops.contains("get_account_revision") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(1) + } + } + + async fn delete_account(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_account") + } + } + + // ── Test helpers ────────────────────────────────────────────────────────── + + fn all_ops() -> Vec<&'static str> { + vec![ + "list_env_components", + "delete_component", + "list_env_domain_registrations", + "delete_domain_registration", + "get_env_app_id_and_revision", + "delete_environment", + "get_application_revision", + "delete_application", + "get_account_revision", + "delete_account", + ] + } + + fn run(mock: &MockCleanupClient) { + let env_id = Uuid::new_v4(); + let account_id = Uuid::new_v4(); 
+ block_on(async { + cleanup_env_and_app_with(mock, &env_id).await; + cleanup_account_with(mock, &account_id).await; + }); + } + + fn contains(calls: &[&str], op: &str) -> bool { + calls.contains(&op) + } + + // ── Tests ───────────────────────────────────────────────────────────────── + + #[test] + fn all_steps_run_on_success() { + let (mock, calls) = MockCleanupClient::new(&[]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + for op in all_ops() { + assert!( + contains(&calls, op), + "expected '{op}' to be called; got: {calls:?}" + ); + } + } + + #[test] + fn step1_list_failure_continues() { + let (mock, calls) = MockCleanupClient::new(&["list_env_components"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!( + contains(&calls, "list_env_domain_registrations"), + "{calls:?}" + ); + assert!(contains(&calls, "get_env_app_id_and_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } + + #[test] + fn step2_list_failure_continues() { + let (mock, calls) = MockCleanupClient::new(&["list_env_domain_registrations"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "get_env_app_id_and_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } + + /// `get_env_app_id_and_revision` (step 3 get) fails → step 4 is skipped + /// (no app_id available) but step 5 still runs. + #[test] + fn step3_get_failure_skips_step4_runs_step5() { + let (mock, calls) = MockCleanupClient::new(&["get_env_app_id_and_revision"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!( + !contains(&calls, "get_application_revision"), + "step 4 must be skipped when step 3 get fails; got: {calls:?}" + ); + assert!( + contains(&calls, "get_account_revision"), + "step 5 must still run; got: {calls:?}" + ); + } + + /// `delete_environment` fails but get succeeded, so app_id is available: + /// step 4 and step 5 both run. 
+ #[test] + fn step3_delete_failure_still_runs_step4_and_step5() { + let (mock, calls) = MockCleanupClient::new(&["delete_environment"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "get_application_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } + + #[test] + fn step4_failure_continues_to_step5() { + let (mock, calls) = MockCleanupClient::new(&["get_application_revision"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!( + contains(&calls, "get_account_revision"), + "step 5 should run after step 4 failure; got: {calls:?}" + ); + } + + /// `get_account_revision` (step 5 get) fails → function completes without + /// panic and `delete_account` is not attempted. + #[test] + fn step5_get_failure_no_delete_and_completes() { + let (mock, calls) = MockCleanupClient::new(&["get_account_revision"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + assert!( + !contains(&calls, "delete_account"), + "delete_account must not run when get fails; got: {calls:?}" + ); + } + + /// All steps fail simultaneously — function completes without panic and + /// every unconditional step is attempted. 
+ #[test] + fn all_steps_fail_no_short_circuit() { + let (mock, calls) = MockCleanupClient::new(&all_ops()); + run(&mock); // must not panic + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "list_env_components"), "{calls:?}"); + assert!( + contains(&calls, "list_env_domain_registrations"), + "{calls:?}" + ); + assert!(contains(&calls, "get_env_app_id_and_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } +} diff --git a/integration-tests/src/benchmarks/cold_start_unknown.rs b/integration-tests/src/benchmarks/cold_start_unknown.rs index f29f297658..592b80e2e4 100644 --- a/integration-tests/src/benchmarks/cold_start_unknown.rs +++ b/integration-tests/src/benchmarks/cold_start_unknown.rs @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::benchmarks::{delete_workers, invoke_and_await_agent}; +use crate::benchmarks::{ + cleanup_account, cleanup_env_and_app, delete_workers, invoke_and_await_agent, +}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -196,6 +199,8 @@ impl Benchmark for ColdStartUnknownMedium { pub struct IterationContext { user: TestUserContext, agents: Vec<(ComponentDto, ParsedAgentId)>, + /// One env_id per size (cold_start creates one env per component). 
+ env_ids: Vec, } pub struct ColdStartUnknownBenchmark { @@ -235,11 +240,13 @@ impl ColdStartUnknownBenchmark { pub async fn setup_iteration(&self, config: &RunConfig) -> IterationContext { let user = self.deps.user().await.unwrap(); let mut agents = vec![]; + let mut env_ids = vec![]; for _ in 0..config.size { // Agent types names are unique within one environment, // so make sure each component get its own env let (_, env) = user.app_and_env().await.unwrap(); + env_ids.push(env.id); let component = user .component(&env.id, &self.component_name) @@ -252,7 +259,11 @@ impl ColdStartUnknownBenchmark { agents.push((component, agent_id)); } - IterationContext { user, agents } + IterationContext { + user, + agents, + env_ids, + } } pub async fn warmup(&self, config: &RunConfig) { @@ -298,6 +309,14 @@ impl ColdStartUnknownBenchmark { .iter() .filter_map(|(component, agent_id)| AgentId::from_agent_id(component.id, agent_id).ok()) .collect(); - delete_workers(&iteration.user, &agent_ids).await + delete_workers(&iteration.user, &agent_ids).await; + // Clean up each env/app individually, then delete the account once. + // This avoids the account being deleted on the first env cleanup and + // causing subsequent cleanup calls to fail (since the user token would + // be invalid after account deletion). + for env_id in &iteration.env_ids { + cleanup_env_and_app(&iteration.user, env_id).await; + } + cleanup_account(&iteration.user).await; } } diff --git a/integration-tests/src/benchmarks/durability_overhead.rs b/integration-tests/src/benchmarks/durability_overhead.rs index f956eb3636..fb864fd44c 100644 --- a/integration-tests/src/benchmarks/durability_overhead.rs +++ b/integration-tests/src/benchmarks/durability_overhead.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent}; +use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::{ComponentDto, ComponentId}; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -42,6 +43,7 @@ pub struct DurabilityOverheadIterationContext { durable_nonpersistent_agent_ids: Vec, ephemeral_agent_ids: Vec, durable_persistent_commit_agent_ids: Vec, + env_id: EnvironmentId, } fn agent_ids_to_agent_ids(component_id: ComponentId, agent_ids: &[ParsedAgentId]) -> Vec { @@ -146,6 +148,7 @@ impl Benchmark for DurabilityOverhead { durable_nonpersistent_agent_ids, ephemeral_agent_ids, durable_persistent_commit_agent_ids, + env_id: env.id, } } @@ -336,5 +339,6 @@ impl Benchmark for DurabilityOverhead { ), ) .await; + cleanup_user_state(&context.user, &context.env_id).await; } } diff --git a/integration-tests/src/benchmarks/latency.rs b/integration-tests/src/benchmarks/latency.rs index a44ff42333..006d29f228 100644 --- a/integration-tests/src/benchmarks/latency.rs +++ b/integration-tests/src/benchmarks/latency.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent}; +use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -200,6 +201,7 @@ pub struct IterationContext { component: ComponentDto, agent_ids: Vec, length: usize, + env_id: EnvironmentId, } pub struct LatencyBenchmark { @@ -261,6 +263,7 @@ impl LatencyBenchmark { component, agent_ids, length: config.length, + env_id: env.id, } } @@ -326,6 +329,7 @@ impl LatencyBenchmark { .iter() .filter_map(|agent_id| AgentId::from_agent_id(iteration.component.id, agent_id).ok()) .collect(); - delete_workers(&iteration.user, &agent_ids).await + delete_workers(&iteration.user, &agent_ids).await; + cleanup_user_state(&iteration.user, &iteration.env_id).await; } } diff --git a/integration-tests/src/benchmarks/mod.rs b/integration-tests/src/benchmarks/mod.rs index b15dde89a3..d1651f063f 100644 --- a/integration-tests/src/benchmarks/mod.rs +++ b/integration-tests/src/benchmarks/mod.rs @@ -29,15 +29,19 @@ use std::time::{Duration, SystemTime}; use tracing::{Instrument, info, warn}; use tracing_opentelemetry::OpenTelemetrySpanExt; +pub mod cleanup; pub mod cold_start_unknown; pub mod durability_overhead; pub mod latency; pub mod sleep; pub mod throughput; -/// Injects the current tracing span's OpenTelemetry trace context (traceparent/tracestate) -/// into a reqwest Request's headers so that downstream services can link their -/// spans to the benchmark's trace. +// Re-export cleanup helpers so callers can use the flat `benchmarks::*` path. 
+pub use cleanup::{cleanup_account, cleanup_env_and_app, cleanup_user_state}; + +/// Injects the current tracing span's OpenTelemetry trace context +/// (traceparent/tracestate) into a reqwest Request's headers so that +/// downstream services can link their spans to the benchmark's trace. fn inject_trace_context(request: &mut Request) { let current_span = tracing::Span::current(); let otel_context = current_span.context(); diff --git a/integration-tests/src/benchmarks/sleep.rs b/integration-tests/src/benchmarks/sleep.rs index 97bb64e16f..457872ed29 100644 --- a/integration-tests/src/benchmarks/sleep.rs +++ b/integration-tests/src/benchmarks/sleep.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::benchmarks::delete_workers; +use crate::benchmarks::{cleanup_user_state, delete_workers}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -39,6 +40,7 @@ pub struct SleepIterationContext { user: TestUserContext, component: ComponentDto, agent_ids: Vec, + env_id: EnvironmentId, } #[async_trait] @@ -111,6 +113,7 @@ impl Benchmark for Sleep { user, component, agent_ids, + env_id: env.id, } } @@ -184,6 +187,7 @@ impl Benchmark for Sleep { .iter() .filter_map(|agent_id| AgentId::from_agent_id(context.component.id, agent_id).ok()) .collect(); - delete_workers(&context.user, &agent_ids).await + delete_workers(&context.user, &agent_ids).await; + cleanup_user_state(&context.user, &context.env_id).await; } } diff --git a/integration-tests/src/benchmarks/throughput.rs 
b/integration-tests/src/benchmarks/throughput.rs index 9cdecd7a1f..5515090847 100644 --- a/integration-tests/src/benchmarks/throughput.rs +++ b/integration-tests/src/benchmarks/throughput.rs @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::benchmarks::{delete_workers, invoke_and_await_agent, invoke_and_await_http}; +use crate::benchmarks::{ + cleanup_user_state, delete_workers, invoke_and_await_agent, invoke_and_await_http, +}; use async_trait::async_trait; use axum::http::{HeaderMap, HeaderValue}; use futures_concurrency::future::Join; @@ -21,6 +23,7 @@ use golem_common::base_model::agent::{DataValue, ParsedAgentId}; use golem_common::model::agent::AgentTypeName; use golem_common::model::component::{ComponentDto, ComponentId}; use golem_common::model::domain_registration::{Domain, DomainRegistrationCreation}; +use golem_common::model::environment::EnvironmentId; use golem_common::model::http_api_deployment::{ HttpApiDeploymentAgentOptions, HttpApiDeploymentCreation, }; @@ -79,16 +82,14 @@ impl Benchmark for ThroughputEcho { "echo", "echo", Box::new(|_| data_value!("benchmark")), - Box::new(|port, idx, _length| { - let url = Url::parse(&format!( - "http://localhost:{port}/test-{idx}-http/echo/test-message" - )) - .unwrap(); + Box::new(|base_url, idx, _length| { + let url = + Url::parse(&format!("{base_url}/test-{idx}-http/echo/test-message")).unwrap(); Request::new(Method::POST, url) }), - Box::new(|port, idx, _length| { + Box::new(|base_url, idx, _length| { let url = Url::parse(&format!( - "http://localhost:{port}/rust/test-{idx}-http/echo/test-message" + "{base_url}/rust/test-{idx}-http/echo/test-message" )) .unwrap(); Request::new(Method::POST, url) @@ -179,21 +180,16 @@ impl Benchmark for ThroughputLargeInput { let bytes = vec![0u8; length]; data_value!(bytes) }), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - 
"http://localhost:{port}/test-{idx}-http/large-input" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = Url::parse(&format!("{base_url}/test-{idx}-http/large-input")).unwrap(); let json_body = json!({"input": vec![0u8; length]}).to_string(); let mut request = Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); request }), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - "http://localhost:{port}/rust/test-{idx}-http/large-input" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = + Url::parse(&format!("{base_url}/rust/test-{idx}-http/large-input")).unwrap(); let json_body = json!({"input": vec![0u8; length]}).to_string(); let mut request = Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); @@ -282,21 +278,16 @@ impl Benchmark for ThroughputCpuIntensive { "cpu_intensive", "cpuIntensive", Box::new(|length| data_value!(length as f64)), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - "http://localhost:{port}/test-{idx}-http/cpu-intensive" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = Url::parse(&format!("{base_url}/test-{idx}-http/cpu-intensive")).unwrap(); let json_body = json!({"length": length}).to_string(); let mut request = Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); request }), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - "http://localhost:{port}/rust/test-{idx}-http/cpu-intensive" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = + Url::parse(&format!("{base_url}/rust/test-{idx}-http/cpu-intensive")).unwrap(); let json_body = json!({"length": length}).to_string(); let mut request = Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); @@ -402,14 +393,20 @@ impl AgentInvocationTarget { } } - pub fn prefix(&self, prefix: &str, routing_table: &RoutingTable) -> String { + pub fn prefix(&self, prefix: 
&str, routing_table: &Option) -> String { match self { AgentInvocationTarget::Single { .. } => prefix.to_string(), AgentInvocationTarget::Pair { pair, .. } => { - if pair.at_same_worker_executor(routing_table) { - format!("{prefix}local-") + if let Some(rt) = routing_table { + if pair.at_same_worker_executor(rt) { + format!("{prefix}local-") + } else { + format!("{prefix}remote-") + } } else { - format!("{prefix}remote-") + // Routing table not available (no shard-manager port-forward + // configured); all RPC pairs go into a single unlabeled bucket. + prefix.to_string() } } } @@ -426,19 +423,35 @@ pub struct IterationContext { rust_agent_ids_for_http: Vec, ts_agent_ids_for_http: Vec, length: usize, - routing_table: RoutingTable, + /// `None` when shard-manager host/port are not configured (cloud mode + /// without port-forward). When `None`, RPC pairs go into a single unlabeled + /// bucket instead of being split into local/remote. + routing_table: Option, ts_rpc_agent_id_pairs: Vec, rust_rpc_agent_id_pairs: Vec, + env_id: EnvironmentId, } +/// Type for HTTP request builder closures used by the throughput benchmark. +/// Receives `(base_url, agent_index, length)` where `base_url` is the full +/// scheme+host+port prefix (e.g. `http://localhost:8084` in local mode or +/// `https://myenv.apps.golem.dev` in cloud mode). +type HttpRequestFn = Box Fn(&'a str, usize, usize) -> Request + Send + Sync + 'static>; + pub struct ThroughputBenchmark { rust_method_name: String, ts_method_name: String, agent_params: Box DataValue + Send + Sync + 'static>, - http_request: Box Request + Send + Sync + 'static>, - rust_http_request: Box Request + Send + Sync + 'static>, + http_request: HttpRequestFn, + rust_http_request: HttpRequestFn, deps: BenchmarkTestDependencies, call_count: usize, + /// Pre-built HTTP client for cloud-mode apps-domain calls + /// (`https://{env_id}.{apps_base_domain}`). Cached here so the + /// connection pool is warm across benchmark iterations. 
+ /// `None` in local/provided mode (client is built per-iteration from the + /// custom-request port with a Host header override). + cloud_http_client: Option, } fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> Vec { @@ -452,8 +465,8 @@ impl ThroughputBenchmark { rust_method_name: &str, ts_method_name: &str, agent_params: Box DataValue + Send + Sync + 'static>, - http_request: Box Request + Send + Sync + 'static>, - rust_http_request: Box Request + Send + Sync + 'static>, + http_request: HttpRequestFn, + rust_http_request: HttpRequestFn, mode: &TestMode, verbosity: Level, cluster_size: usize, @@ -461,21 +474,40 @@ impl ThroughputBenchmark { call_count: usize, otlp: bool, ) -> Self { + let deps = BenchmarkTestDependencies::new( + mode, + verbosity, + cluster_size, + disable_compilation_cache, + otlp, + ) + .await; + + // Build the cloud HTTP client once so the connection pool stays alive + // across all benchmark iterations. In cloud mode requests go to + // https://{env_id}.{apps_base_domain}, so we use standard TLS with + // ALPN negotiation — NOT http2_prior_knowledge() which is for h2c + // (cleartext HTTP/2) and would bypass the ALPN step that the NLB + // terminating TLS expects. 
+ let cloud_http_client = deps.apps_base_domain().map(|_| { + reqwest::ClientBuilder::new() + .pool_max_idle_per_host(1024) + .pool_idle_timeout(std::time::Duration::from_secs(90)) + .tcp_nodelay(true) + .timeout(std::time::Duration::from_secs(180)) + .build() + .expect("Failed to build cloud HTTP client for throughput benchmark") + }); + Self { rust_method_name: rust_method_name.to_string(), ts_method_name: ts_method_name.to_string(), agent_params, http_request, rust_http_request, - deps: BenchmarkTestDependencies::new( - mode, - verbosity, - cluster_size, - disable_compilation_cache, - otlp, - ) - .await, + deps, call_count, + cloud_http_client, } } @@ -491,13 +523,23 @@ impl ThroughputBenchmark { let mut ts_rpc_agent_id_pairs = vec![]; let mut rust_rpc_agent_id_pairs = vec![]; - let routing_table = self - .deps - .shard_manager() - .get_routing_table() - .await - .expect("Failed to get routing table"); - info!("Fetched routing table: {routing_table}"); + // Fetch routing table when shard-manager is configured; fall back to + // None (unlabeled single-bucket RPC) when not configured (e.g. cloud + // mode without a port-forward to the shard-manager). + let routing_table: Option = + match self.deps.shard_manager().get_routing_table().await { + Ok(rt) => { + info!("Fetched routing table: {rt}"); + Some(rt) + } + Err(err) => { + info!( + "Shard-manager not available, skipping routing table (RPC pairs \ + will be unlabeled): {err:#}" + ); + None + } + }; let user = self.deps.user().await.unwrap(); let (_, env) = user.app_and_env().await.unwrap(); @@ -542,7 +584,14 @@ impl ThroughputBenchmark { let client = user.registry_service_client().await; - let domain = Domain(format!("{}.golem.cloud", env.id)); + // In cloud mode, use the configured apps_base_domain. Fall back to + // "golem.cloud" for local/provided modes. 
+ let apps_base_domain = self + .deps + .apps_base_domain() + .unwrap_or("golem.cloud") + .to_string(); + let domain = Domain(format!("{}.{}", env.id, apps_base_domain)); async { client @@ -605,6 +654,7 @@ impl ThroughputBenchmark { routing_table, ts_rpc_agent_id_pairs, rust_rpc_agent_id_pairs, + env_id: env.id, } } @@ -713,7 +763,7 @@ impl ThroughputBenchmark { pub async fn run(&self, iteration: &IterationContext, recorder: BenchmarkRecorder) { async fn measure_agents( user: &TestUserContext, - routing_table: &RoutingTable, + routing_table: &Option, recorder: &BenchmarkRecorder, length: usize, call_count: usize, @@ -799,31 +849,51 @@ impl ThroughputBenchmark { .instrument(tracing::info_span!("measure_ts_agents")) .await; - let port = self.deps.worker_service().custom_request_port(); - - let client = { - let mut headers = HeaderMap::new(); - headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap()); - reqwest::Client::builder() - .default_headers(headers) - .build() - .expect("Failed to create HTTP client") - }; + // Resolve the base URL prefix and HTTP client for the code-first HTTP + // API benchmark paths. The request-builder closures append the route + // path (e.g. "/test-0-http/echo/...") to this prefix. + // + // cloud mode: base = "https://{env_id}.apps.dev.golem.cloud" + // → reqwest connects directly to that host (TLS/SNI + + // Host set from the URL); the apps gateway routes it + // to worker-service. Uses the cached, pool-warm client. + // + // local mode: base = "http://localhost:{custom_request_port}" + // → reqwest connects to localhost; an explicit Host + // header ("{env_id}.golem.cloud") tells the local + // worker-service which deployment to route to. 
+ let (http_base_url, client): (String, reqwest::Client) = + if let Some(ref cached) = self.cloud_http_client { + let base = format!("https://{}", iteration.domain.0); + (base, cached.clone()) + } else { + let port = self.deps.worker_service().custom_request_port(); + let base = format!("http://localhost:{port}"); + let mut headers = HeaderMap::new(); + headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap()); + let c = reqwest::Client::builder() + .default_headers(headers) + .build() + .expect("Failed to create HTTP client"); + (base, c) + }; async { let client = client.clone(); + let base = http_base_url.clone(); let result_futures = iteration .rust_agent_ids_for_http .iter() .enumerate() .map(move |(idx, _agent_id)| { let client = client.clone(); + let base = base.clone(); async move { let mut results = vec![]; for _ in 0..self.call_count { results.push( invoke_and_await_http(client.clone(), || { - (self.rust_http_request)(port, idx, iteration.length) + (self.rust_http_request)(&base, idx, iteration.length) }) .await, ) @@ -850,12 +920,13 @@ impl ThroughputBenchmark { .enumerate() .map(move |(idx, _agent_id)| { let client = client.clone(); + let base = http_base_url.clone(); async move { let mut results = vec![]; for _ in 0..self.call_count { results.push( invoke_and_await_http(client.clone(), || { - (self.http_request)(port, idx, iteration.length) + (self.http_request)(&base, idx, iteration.length) }) .await, ) @@ -969,5 +1040,6 @@ impl ThroughputBenchmark { } } delete_workers(&iteration.user, &rust_rpc_workers).await; + cleanup_user_state(&iteration.user, &iteration.env_id).await; } } From 341bab33e3e55fd2b4214d8f0affd036d5f76c6c Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 2 Jun 2026 22:15:18 -0700 Subject: [PATCH 02/60] feat: add run specific details to perf tests --- golem-test-framework/src/benchmark/mod.rs | 4 +- golem-test-framework/src/benchmark/results.rs | 99 +++++++++++++++++++ 
.../benchmark_suites/cloud-perf.yaml | 28 ++++++ integration-tests/src/benchmarks/all.rs | 11 ++- 4 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 integration-tests/benchmark_suites/cloud-perf.yaml diff --git a/golem-test-framework/src/benchmark/mod.rs b/golem-test-framework/src/benchmark/mod.rs index fd246b2be7..5e82adde15 100644 --- a/golem-test-framework/src/benchmark/mod.rs +++ b/golem-test-framework/src/benchmark/mod.rs @@ -16,7 +16,9 @@ mod config; mod results; pub use config::{BenchmarkConfig, BenchmarkSuite, BenchmarkSuiteItem, RunConfig}; -pub use results::{BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey}; +pub use results::{ + BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey, RunMetadata, +}; use crate::config::benchmark::TestMode; use async_trait::async_trait; diff --git a/golem-test-framework/src/benchmark/results.rs b/golem-test-framework/src/benchmark/results.rs index a309d1d5e5..afb7319a7d 100644 --- a/golem-test-framework/src/benchmark/results.rs +++ b/golem-test-framework/src/benchmark/results.rs @@ -484,6 +484,97 @@ impl Display for BenchmarkResultView { } } +/// Cloud-mode run metadata collected by the buildspec and passed via environment variables. +/// All fields are optional — missing env vars produce `None` rather than failing the run. +#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RunMetadata { + /// The `golem-oss` commit SHA that was built and deployed. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub golem_oss_commit_sha: Option, + /// The `golem-cloud` (kubernetes manifests) commit SHA that was deployed. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub kubernetes_manifest_commit_sha: Option, + /// Number of Ready `worker-executor` pods observed at run start. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub observed_cluster_size: Option, + /// Container image tag of the deployed `worker-executor`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_executor_image_tag: Option, + /// Container image tag of the deployed `registry-service`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub registry_service_image_tag: Option, + /// Container image tag of the deployed `worker-service`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_service_image_tag: Option, + /// Aurora ACU capacity for the main (`golem_dev`) cluster at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub aurora_acu_main: Option, + /// Aurora ACU capacity for the indexed-storage cluster at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub aurora_acu_indexed: Option, + /// Aurora ACU capacity for the keyvalue-storage cluster at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub aurora_acu_keyvalue: Option, + /// Ready replica count for `worker-executor` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_executor_replicas: Option, + /// Ready replica count for `worker-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_service_replicas: Option, + /// Ready replica count for `registry-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub registry_service_replicas: Option, + /// Ready replica count for `compilation-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub compilation_service_replicas: Option, + /// Ready replica count for `debugging-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub debugging_service_replicas: Option, + /// Free-form note from the `workflow_dispatch` trigger. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub note: Option, +} + +impl RunMetadata { + /// Reads all `GOLEM_BENCH_*` environment variables and returns a populated + /// `RunMetadata`. Missing variables produce `None` for that field. + pub fn from_env() -> Self { + fn env_str(key: &str) -> Option { + std::env::var(key).ok().filter(|v| !v.is_empty()) + } + fn env_u32(key: &str) -> Option { + env_str(key).and_then(|v| v.parse().ok()) + } + fn env_f64(key: &str) -> Option { + env_str(key).and_then(|v| v.parse().ok()) + } + + Self { + golem_oss_commit_sha: env_str("GOLEM_BENCH_OSS_COMMIT_SHA"), + kubernetes_manifest_commit_sha: env_str("GOLEM_BENCH_K8S_MANIFEST_COMMIT_SHA"), + observed_cluster_size: env_u32("GOLEM_BENCH_OBSERVED_CLUSTER_SIZE"), + worker_executor_image_tag: env_str("GOLEM_BENCH_WORKER_EXECUTOR_IMAGE_TAG"), + registry_service_image_tag: env_str("GOLEM_BENCH_REGISTRY_SERVICE_IMAGE_TAG"), + worker_service_image_tag: env_str("GOLEM_BENCH_WORKER_SERVICE_IMAGE_TAG"), + aurora_acu_main: env_f64("GOLEM_BENCH_AURORA_ACU_MAIN"), + aurora_acu_indexed: env_f64("GOLEM_BENCH_AURORA_ACU_INDEXED"), + aurora_acu_keyvalue: env_f64("GOLEM_BENCH_AURORA_ACU_KEYVALUE"), + worker_executor_replicas: env_u32("GOLEM_BENCH_WORKER_EXECUTOR_REPLICAS"), + worker_service_replicas: env_u32("GOLEM_BENCH_WORKER_SERVICE_REPLICAS"), + registry_service_replicas: env_u32("GOLEM_BENCH_REGISTRY_SERVICE_REPLICAS"), + compilation_service_replicas: env_u32("GOLEM_BENCH_COMPILATION_SERVICE_REPLICAS"), + debugging_service_replicas: env_u32("GOLEM_BENCH_DEBUGGING_SERVICE_REPLICAS"), + note: env_str("GOLEM_BENCH_RUN_NOTE"), + } + } + + /// Returns `true` if every field is `None` (nothing was read from env). 
+ pub fn is_empty(&self) -> bool { + self == &Self::default() + } +} + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct BenchmarkSuiteResultCollection { pub runs: Vec, @@ -491,6 +582,8 @@ pub struct BenchmarkSuiteResultCollection { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct BenchmarkSuiteResult { + /// Result format version. Always `1` for results produced by this binary. + pub schema_version: u32, pub suite: String, pub environment: String, pub version: String, @@ -499,6 +592,10 @@ pub struct BenchmarkSuiteResult { /// cross-run correlation and garbage collection of orphaned state. #[serde(skip_serializing_if = "Option::is_none", default)] pub run_id: Option, + /// Cloud-mode run metadata populated from `GOLEM_BENCH_*` environment variables. + /// `None` in Spawned or Provided modes where cluster metadata is not available. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub run_metadata: Option, pub results: Vec, } @@ -530,11 +627,13 @@ impl BenchmarkSuiteResult { ); Self { + schema_version: 1, suite: suite.to_string(), environment, version: golem_common::golem_version().to_string(), timestamp: Utc::now(), run_id: None, + run_metadata: None, results: vec![], } } diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml new file mode 100644 index 0000000000..5387f35da6 --- /dev/null +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -0,0 +1,28 @@ +# Cloud-perf benchmark suite — runs the existing benchmark suite against a +# deployed Golem environment via Gateway-API hostnames (TestMode::Cloud). 
+# +# Run with the benchmarks binary's `cloud` subcommand: +# +# benchmarks suite integration-tests/benchmark_suites/cloud-perf.yaml \ +# --save-to-json result.json \ +# cloud \ +# --api-url https:// \ +# --apps-base-domain \ +# --admin-account-token \ +# --builtin-plugin-owner-account-id \ +# --default-plan-id \ +# --component-directory +# +# Initial milestone: latency-small only. Add the remaining six benchmarks +# once this runs end-to-end successfully. + +name: cloud-perf +benchmarks: + # Measures cold and hot invocation latency through the Gateway using the + # small Rust agent (benchmark_agent_rust_release.wasm). Three iterations + # capture real-network jitter and load-balancer warm-up variance. + - name: latency-small + iterations: 3 + clusterSize: [2] + size: [5, 10] + length: [2] diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs index 6865ecf6e4..9b1efd1eb2 100644 --- a/integration-tests/src/benchmarks/all.rs +++ b/integration-tests/src/benchmarks/all.rs @@ -21,7 +21,7 @@ use golem_common::model::environment::{EnvironmentCreation, EnvironmentName}; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{ Benchmark, BenchmarkApi, BenchmarkConfig, BenchmarkResult, BenchmarkSuite, BenchmarkSuiteItem, - BenchmarkSuiteResult, + BenchmarkSuiteResult, RunMetadata, }; use golem_test_framework::config::benchmark::{TestMode, cloud_bench_run_id}; use golem_test_framework::config::{ @@ -233,9 +233,16 @@ async fn main() { // no else: we already validated all names above } - // Attach the run_id to result metadata (cloud mode only). + // Attach the run_id and run_metadata to result metadata (cloud mode only). if let Some(run_id) = cloud_bench_run_id() { suite_result.run_id = Some(format!("bench-{run_id}")); + + // Read GOLEM_BENCH_* env vars set by the buildspec before invoking + // the binary. Missing vars produce None rather than failing the run. 
+ let metadata = RunMetadata::from_env(); + if !metadata.is_empty() { + suite_result.run_metadata = Some(metadata); + } } if let Some(path) = save_to_json { From b1764ece0906ebdbae8d02fd4b1c8436468113fe Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:25:01 -0700 Subject: [PATCH 03/60] fix(benchmark): make --builtin-plugin-owner-account-id and --default-plan-id optional --- golem-test-framework/src/config/benchmark.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/golem-test-framework/src/config/benchmark.rs b/golem-test-framework/src/config/benchmark.rs index e97a58c47d..34ac140d23 100644 --- a/golem-test-framework/src/config/benchmark.rs +++ b/golem-test-framework/src/config/benchmark.rs @@ -280,10 +280,14 @@ pub enum TestMode { #[arg(long)] admin_account_token: String, /// UUID of the builtin-plugin-owner account. - #[arg(long)] + /// Only needed for environment-plugin-grant tests; benchmarks do not + /// use it so the default (nil UUID) is fine for benchmark runs. + #[arg(long, default_value_t = Uuid::nil())] builtin_plugin_owner_account_id: Uuid, /// UUID of the default plan on the target cluster. - #[arg(long)] + /// Only needed for environment-plugin-grant tests; benchmarks do not + /// use it so the default (nil UUID) is fine for benchmark runs. + #[arg(long, default_value_t = Uuid::nil())] default_plan_id: Uuid, /// Optional shard-manager gRPC hostname for a kubectl port-forward /// (e.g. `localhost`). When set together with From 4294bdbe29bf9a04f54394da9778a5cf95d48232 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:16:43 -0700 Subject: [PATCH 04/60] fix: make ProvidedShardManager kill/restart no-ops instead of panics kill_all() is called after cloud_preflight_warmup completes. ProvidedShardManager wraps an already-running process we don't own, so neither kill nor restart should crash the binary. 
Both are now silent no-ops, matching UnavailableShardManager. --- golem-test-framework/src/components/shard_manager/provided.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/golem-test-framework/src/components/shard_manager/provided.rs b/golem-test-framework/src/components/shard_manager/provided.rs index d7e4ff1305..84d213a5fb 100644 --- a/golem-test-framework/src/components/shard_manager/provided.rs +++ b/golem-test-framework/src/components/shard_manager/provided.rs @@ -40,10 +40,10 @@ impl ShardManager for ProvidedShardManager { } async fn kill(&self) { - panic!("Cannot kill provided shard manager"); + // Nothing to do — we do not own this shard manager process. } async fn restart(&self, _number_of_shards_override: Option) { - panic!("Cannot restart provided shard manager"); + // Nothing to do — we do not own this shard manager process. } } From 5b9902b55875d772d16e1fc004160d13caf6d8a1 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 10:44:57 -0700 Subject: [PATCH 05/60] feat(benchmark): enable all tests --- .../benchmark_suites/cloud-perf.yaml | 101 ++++++++++++++++-- 1 file changed, 94 insertions(+), 7 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 5387f35da6..9805b18e6f 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -1,4 +1,4 @@ -# Cloud-perf benchmark suite — runs the existing benchmark suite against a +# Cloud-perf benchmark suite — runs the full benchmark suite against a # deployed Golem environment via Gateway-API hostnames (TestMode::Cloud). # # Run with the benchmarks binary's `cloud` subcommand: @@ -13,16 +13,103 @@ # --default-plan-id \ # --component-directory # -# Initial milestone: latency-small only. Add the remaining six benchmarks -# once this runs end-to-end successfully. 
+# Note: clusterSize is ignored in Cloud mode (the observed cluster size is +# read from shard-manager at run start and recorded in result metadata). name: cloud-perf benchmarks: - # Measures cold and hot invocation latency through the Gateway using the - # small Rust agent (benchmark_agent_rust_release.wasm). Three iterations - # capture real-network jitter and load-balancer warm-up variance. + # Cold-start: compilation cache disabled — measures true cold-start latency + # with no warm compiled artefact available. + # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [2] + disableCompilationCache: true + + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [5] + disableCompilationCache: true + + # Cold-start: compilation cache enabled — measures latency once the compiled + # artefact is available in the cache. + # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [2] + + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [5] + + # Invocation latency — hot and cold paths through the Gateway NLB. + # Large worker counts to stress the load balancer and connection pool. + # size = number of workers created + # length = number of hot invocations per worker after the first cold one - name: latency-small iterations: 3 clusterSize: [2] - size: [5, 10] + size: [100, 500, 1000, 2000, 5000] length: [2] + + - name: latency-medium + iterations: 3 + clusterSize: [2] + size: [100, 500, 1000, 2000] + length: [5] + + # Sleep — measures worker suspension and resumption under real network + # conditions. 
+ # size = number of workers launched in parallel + # length = sleep duration in milliseconds + - name: sleep + iterations: 3 + clusterSize: [2] + size: [10, 100, 500, 1000] + length: [10000] + + # Durability overhead — measures the cost of durable vs ephemeral execution + # across four variants (durable-persistent, durable-non-persistent, + # ephemeral, durable-persistent-commit). + # size = number of workers per variant + # length = loop iteration count passed to oplog_heavy + - name: durability-overhead + iterations: 3 + clusterSize: [2] + size: [10, 50, 100, 200] + length: [5000] + + # Throughput — measures invocation throughput across six implementations. + # size = number of workers per implementation + # length = unused for echo + - name: throughput-echo + iterations: 3 + clusterSize: [2] + size: [1, 10, 50, 100, 250] + length: [1000] + + # size = number of workers per implementation + # length = payload size in bytes sent to large_input + - name: throughput-large-input + iterations: 3 + clusterSize: [2] + size: [1, 10, 25, 50] + length: [100, 10000] + + # size = number of workers per implementation + # length = CPU work length passed to cpu_intensive + - name: throughput-cpu-intensive + iterations: 3 + clusterSize: [2] + size: [1, 10, 25, 50] + length: [100] From 742a6695ad564e942026f42caec01b1dad075b5c Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 13:34:01 -0700 Subject: [PATCH 06/60] feat: retry connectivity to shard manager --- .../src/components/shard_manager/mod.rs | 70 ++++++++++++++----- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs index 67b10051bf..bb387cd1ac 100644 --- a/golem-test-framework/src/components/shard_manager/mod.rs +++ b/golem-test-framework/src/components/shard_manager/mod.rs @@ -31,7 +31,7 @@ use std::sync::Arc; use std::time::Duration; use 
tonic::codec::CompressionEncoding; use tonic::transport::Channel; -use tracing::Level; +use tracing::{Level, warn}; #[async_trait] pub trait ShardManager: Send + Sync { @@ -47,25 +47,30 @@ pub trait ShardManager: Send + Sync { async fn restart(&self, number_of_shards_override: Option); async fn get_routing_table(&self) -> crate::Result { - let routing_table = self - .client() - .await - .get_routing_table(GetRoutingTableRequest {}) - .await - .expect("Unable to fetch the routing table from shard-manager-service"); - - match routing_table.into_inner() { - shardmanager::v1::GetRoutingTableResponse { - result: - Some(shardmanager::v1::get_routing_table_response::Result::Success(routing_table)), - } => Ok(routing_table - .try_into() - .map_err(|e| anyhow!("Failed converting routing table: {e}"))?), - shardmanager::v1::GetRoutingTableResponse { - result: Some(shardmanager::v1::get_routing_table_response::Result::Failure(err)), - } => Err(anyhow!("Failed to get routing table: {err:?}")), - _ => Err(anyhow!("Failed to get routing table")), + // Retry with backoff to tolerate transient port-forward reconnects. + // The port-forward watchdog restarts in ~500ms, so 10 attempts with + // 1s delay gives ~10s of tolerance before giving up. + let max_attempts = 10; + let retry_delay = Duration::from_secs(1); + let mut last_err = anyhow!("get_routing_table: no attempts made"); + + for attempt in 1..=max_attempts { + match try_get_routing_table(&self.grpc_host(), self.grpc_port()).await { + Ok(rt) => return Ok(rt), + Err(err) => { + warn!( + attempt, + max_attempts, + error = %err, + "Failed to fetch routing table, retrying..." 
+ ); + last_err = err; + tokio::time::sleep(retry_delay).await; + } + } } + + Err(last_err) } } @@ -77,6 +82,33 @@ async fn new_client(host: &str, grpc_port: u16) -> ShardManagerServiceClient crate::Result { + let mut client = + ShardManagerServiceClient::connect(format!("http://{host}:{grpc_port}")) + .await + .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))? + .send_compressed(CompressionEncoding::Gzip) + .accept_compressed(CompressionEncoding::Gzip); + + let routing_table = client + .get_routing_table(GetRoutingTableRequest {}) + .await + .map_err(|e| anyhow!("Unable to fetch the routing table from shard-manager-service: {e}"))?; + + match routing_table.into_inner() { + shardmanager::v1::GetRoutingTableResponse { + result: + Some(shardmanager::v1::get_routing_table_response::Result::Success(routing_table)), + } => Ok(routing_table + .try_into() + .map_err(|e| anyhow!("Failed converting routing table: {e}"))?), + shardmanager::v1::GetRoutingTableResponse { + result: Some(shardmanager::v1::get_routing_table_response::Result::Failure(err)), + } => Err(anyhow!("Failed to get routing table: {err:?}")), + _ => Err(anyhow!("Failed to get routing table")), + } +} + async fn wait_for_startup( host: &str, grpc_port: u16, From 18d5af6950c06aa02409dfd96449c35213884302 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 14:15:16 -0700 Subject: [PATCH 07/60] chore: fmt --- .../src/components/shard_manager/mod.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs index bb387cd1ac..91ed2ed2da 100644 --- a/golem-test-framework/src/components/shard_manager/mod.rs +++ b/golem-test-framework/src/components/shard_manager/mod.rs @@ -83,17 +83,18 @@ async fn new_client(host: &str, grpc_port: u16) -> ShardManagerServiceClient crate::Result { - let mut client = - 
ShardManagerServiceClient::connect(format!("http://{host}:{grpc_port}")) - .await - .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))? - .send_compressed(CompressionEncoding::Gzip) - .accept_compressed(CompressionEncoding::Gzip); + let mut client = ShardManagerServiceClient::connect(format!("http://{host}:{grpc_port}")) + .await + .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))? + .send_compressed(CompressionEncoding::Gzip) + .accept_compressed(CompressionEncoding::Gzip); let routing_table = client .get_routing_table(GetRoutingTableRequest {}) .await - .map_err(|e| anyhow!("Unable to fetch the routing table from shard-manager-service: {e}"))?; + .map_err(|e| { + anyhow!("Unable to fetch the routing table from shard-manager-service: {e}") + })?; match routing_table.into_inner() { shardmanager::v1::GetRoutingTableResponse { From 395bcd2113017d99a398ff5a32ab072215c662dd Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 15:48:54 -0700 Subject: [PATCH 08/60] investigation: run echo test first to see if they get stuck again --- .../benchmark_suites/cloud-perf.yaml | 59 +++++++++++-------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 9805b18e6f..6e258c3c99 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -15,9 +15,41 @@ # # Note: clusterSize is ignored in Cloud mode (the observed cluster size is # read from shard-manager at run start and recorded in result metadata). +# +# Suite order rationale: throughput benchmarks run first because they involve +# RPC worker pairs and HTTP deployments — the most complex setup. Running them +# early surfaces infrastructure issues (stuck workers, port-forward drops) +# before spending time on the simpler benchmarks. 
name: cloud-perf benchmarks: + # Throughput — measures invocation throughput across six implementations: + # rust agent (gRPC), TS agent (gRPC), rust agent (HTTP), TS agent (HTTP), + # TS RPC pair, rust RPC pair. + # size = number of workers per implementation (×6 implementations total) + # length = unused for echo + - name: throughput-echo + iterations: 3 + clusterSize: [2] + size: [1, 10, 50, 100, 250] + length: [1000] + + # size = number of workers per implementation + # length = payload size in bytes sent to large_input + - name: throughput-large-input + iterations: 3 + clusterSize: [2] + size: [1, 10, 25, 50] + length: [100, 10000] + + # size = number of workers per implementation + # length = CPU work length passed to cpu_intensive + - name: throughput-cpu-intensive + iterations: 3 + clusterSize: [2] + size: [1, 10, 25, 50] + length: [100] + # Cold-start: compilation cache disabled — measures true cold-start latency # with no warm compiled artefact available. # size = number of unique components created (each in its own env) @@ -40,6 +72,8 @@ benchmarks: # artefact is available in the cache. # size = number of unique components created (each in its own env) # length = seconds to wait per component for pre-compilation warm-up + # NOTE: if results here are close to the cache-disabled entries above, the + # warm-up wait is too short and compilation hasn't finished — bump length. - name: cold-start-unknown-small iterations: 3 clusterSize: [2] @@ -88,28 +122,3 @@ benchmarks: clusterSize: [2] size: [10, 50, 100, 200] length: [5000] - - # Throughput — measures invocation throughput across six implementations. 
- # size = number of workers per implementation - # length = unused for echo - - name: throughput-echo - iterations: 3 - clusterSize: [2] - size: [1, 10, 50, 100, 250] - length: [1000] - - # size = number of workers per implementation - # length = payload size in bytes sent to large_input - - name: throughput-large-input - iterations: 3 - clusterSize: [2] - size: [1, 10, 25, 50] - length: [100, 10000] - - # size = number of workers per implementation - # length = CPU work length passed to cpu_intensive - - name: throughput-cpu-intensive - iterations: 3 - clusterSize: [2] - size: [1, 10, 25, 50] - length: [100] From dac3c697826b2bc25192b5c14cb3e40ef072e821 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:28:27 -0700 Subject: [PATCH 09/60] feat(benchmark): lower number of concurrent live apps --- integration-tests/benchmark_suites/cloud-perf.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 6e258c3c99..b91a3d1821 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -31,7 +31,7 @@ benchmarks: - name: throughput-echo iterations: 3 clusterSize: [2] - size: [1, 10, 50, 100, 250] + size: [1, 10, 50, 100] length: [1000] # size = number of workers per implementation From 22566231cfd3e148fde30c84253fa597b366c378 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 21:53:40 -0700 Subject: [PATCH 10/60] feat: more observability, make memory component coefficient configurable --- .../config/debug-worker-executor.sample.env | 2 + .../config/debug-worker-executor.toml | 2 + .../config/worker-executor.sample.env | 3 ++ .../config/worker-executor.toml | 3 ++ golem-worker-executor/src/metrics.rs | 42 +++++++++++++------ .../src/services/golem_config.rs | 12 ++++++
golem-worker-executor/src/worker/mod.rs | 10 ++++- 7 files changed, 61 insertions(+), 13 deletions(-) diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env index 077c693c32..66afafc82a 100644 --- a/golem-debugging-service/config/debug-worker-executor.sample.env +++ b/golem-debugging-service/config/debug-worker-executor.sample.env @@ -55,6 +55,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -228,6 +229,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml index 8ee03a9c23..7191c36996 100644 --- a/golem-debugging-service/config/debug-worker-executor.toml +++ b/golem-debugging-service/config/debug-worker-executor.toml @@ -96,6 +96,7 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" +component_size_coefficient = 2.0 worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -364,6 +365,7 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# component_size_coefficient = 2.0 # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index 
2a52884966..2ef7701cc5 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env @@ -72,6 +72,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -291,6 +292,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -480,6 +482,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index 4c89275519..b1bab39be9 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ b/golem-worker-executor/config/worker-executor.toml @@ -125,6 +125,7 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" +component_size_coefficient = 2.0 worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -456,6 +457,7 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# component_size_coefficient = 2.0 # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # @@ -757,6 +759,7 @@ without_time = false # # [memory] # 
acquire_retry_delay = "500ms" +# component_size_coefficient = 2.0 # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index 768b3e6b98..c9d610e79d 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -69,18 +69,26 @@ const SCHEDULER_LAG_BUCKETS: &[f64; 11] = &[ 0.001, 0.01, 0.1, 1.0, 5.0, 15.0, 30.0, 60.0, 120.0, 300.0, 600.0, ]; -const MEMORY_SIZE_BUCKETS: &[f64; 11] = &[ - 1024.0, - 4096.0, - 16384.0, - 65536.0, - 262144.0, - 1048576.0, - 4194304.0, - 16777216.0, - 67108864.0, - 268435456.0, - 1073741824.0, +/// Buckets for the size of a single `memory.grow` allocation. Deliberately +/// fine-grained in the 1-32 MiB band where typical guest grows cluster, so +/// that p90/p99 quantiles are not pinned to a coarse 4-16 MiB bucket edge. +const MEMORY_SIZE_BUCKETS: &[f64; 16] = &[ + 65536.0, // 64 KiB + 262144.0, // 256 KiB + 1048576.0, // 1 MiB + 2097152.0, // 2 MiB + 4194304.0, // 4 MiB + 6291456.0, // 6 MiB + 8388608.0, // 8 MiB + 12582912.0, // 12 MiB + 16777216.0, // 16 MiB + 25165824.0, // 24 MiB + 33554432.0, // 32 MiB + 67108864.0, // 64 MiB + 134217728.0, // 128 MiB + 268435456.0, // 256 MiB + 536870912.0, // 512 MiB + 1073741824.0, // 1 GiB ]; pub mod component { @@ -508,6 +516,12 @@ pub mod wasm { crate::metrics::MEMORY_SIZE_BUCKETS.to_vec() ) .unwrap(); + static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!( + "worker_resident_linear_memory_bytes", + "Per-worker cumulative linear memory size (total_linear_memory_size) observed when acquiring a memory permit", + crate::metrics::MEMORY_SIZE_BUCKETS.to_vec() + ) + .unwrap(); } lazy_static! 
{ @@ -580,6 +594,10 @@ pub mod wasm { pub fn record_allocated_memory(amount: usize) { ALLOCATED_MEMORY_BYTES.observe(amount as f64); } + + pub fn record_worker_resident_linear_memory(bytes: u64) { + WORKER_RESIDENT_LINEAR_MEMORY_BYTES.observe(bytes as f64); + } } pub mod oplog { diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 733d9529af..76b7720bf0 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -963,6 +963,12 @@ pub struct MemoryConfig { pub system_memory_override: Option, pub worker_memory_ratio: f64, pub worker_estimate_coefficient: f64, + /// Multiplier applied to a worker's `component_size` when estimating its + /// memory permit requirement. The compiled component is loaded into the + /// engine once per component (shared across all workers of that component), + /// so this term over-accounts per-worker memory for large components. + /// Lower this (e.g. to 0.0) to size permits primarily off linear memory. 
+ pub component_size_coefficient: f64, #[serde(with = "humantime_serde")] pub acquire_retry_delay: Duration, pub oom_retry_config: RetryConfig, @@ -1004,6 +1010,11 @@ impl SafeDisplay for MemoryConfig { "worker estimate coefficient: {}", self.worker_estimate_coefficient ); + let _ = writeln!( + &mut result, + "component size coefficient: {}", + self.component_size_coefficient + ); let _ = writeln!( &mut result, "acquire retry delay: {:?}", @@ -1528,6 +1539,7 @@ impl Default for MemoryConfig { system_memory_override: None, worker_memory_ratio: 0.8, worker_estimate_coefficient: 1.1, + component_size_coefficient: 2.0, acquire_retry_delay: Duration::from_millis(500), oom_retry_config: RetryConfig { max_attempts: u32::MAX, diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index 1e6d4fa7cc..a65d6dd867 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -122,6 +122,7 @@ pub struct Worker { execution_status: Arc>, update_state_lock: Mutex<()>, worker_estimate_coefficient: f64, + component_size_coefficient: f64, // IMPORTANT: Every external operation must acquire the instance lock, even briefly, to confirm the worker isn’t deleting. instance: Arc>, @@ -340,6 +341,7 @@ impl Worker { last_known_status: current_status, metrics_status, worker_estimate_coefficient: deps.config().memory.worker_estimate_coefficient, + component_size_coefficient: deps.config().memory.component_size_coefficient, oom_retry_config: deps.config().memory.oom_retry_config.clone(), snapshot_policy, update_state_lock: Mutex::new(()), @@ -410,6 +412,12 @@ impl Worker { WorkerInstance::Unloaded { .. 
} => { this.mark_as_loading(); crate::metrics::workers::inc_worker_waiting_for_memory(); + crate::metrics::wasm::record_worker_resident_linear_memory( + this.get_latest_worker_metadata() + .await + .last_known_status + .total_linear_memory_size, + ); *instance_guard = WorkerInstance::WaitingForPermit(WaitingWorker::new( this.clone(), this.memory_requirement().await?, @@ -795,7 +803,7 @@ impl Worker { let ml = metadata.last_known_status.total_linear_memory_size as f64; let sw = metadata.last_known_status.component_size as f64; - let c = 2.0; + let c = self.component_size_coefficient; let x = self.worker_estimate_coefficient; Ok((x * (ml + c * sw)) as u64) } From 02e527a7ce7d25306a28f98088eef30baf959168 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 23:45:01 -0700 Subject: [PATCH 11/60] feat(benchmark): run only throughput-echo test --- .../benchmark_suites/cloud-perf.yaml | 184 +++++++++--------- 1 file changed, 96 insertions(+), 88 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index b91a3d1821..9cb7bded85 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -20,6 +20,10 @@ # RPC worker pairs and HTTP deployments — the most complex setup. Running them # early surfaces infrastructure issues (stuck workers, port-forward drops) # before spending time on the simpler benchmarks. +# +# NOTE: All benchmarks except throughput-echo are temporarily commented out for +# the memory-semaphore (component_size_coefficient) knob experiment. Restore +# them to run the full suite. 
name: cloud-perf benchmarks: @@ -34,91 +38,95 @@ benchmarks: size: [1, 10, 50, 100] length: [1000] - # size = number of workers per implementation - # length = payload size in bytes sent to large_input - - name: throughput-large-input - iterations: 3 - clusterSize: [2] - size: [1, 10, 25, 50] - length: [100, 10000] - - # size = number of workers per implementation - # length = CPU work length passed to cpu_intensive - - name: throughput-cpu-intensive - iterations: 3 - clusterSize: [2] - size: [1, 10, 25, 50] - length: [100] - - # Cold-start: compilation cache disabled — measures true cold-start latency - # with no warm compiled artefact available. - # size = number of unique components created (each in its own env) - # length = seconds to wait per component for pre-compilation warm-up - - name: cold-start-unknown-small - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [2] - disableCompilationCache: true - - - name: cold-start-unknown-medium - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [5] - disableCompilationCache: true - - # Cold-start: compilation cache enabled — measures latency once the compiled - # artefact is available in the cache. - # size = number of unique components created (each in its own env) - # length = seconds to wait per component for pre-compilation warm-up - # NOTE: if results here are close to the cache-disabled entries above, the - # warm-up wait is too short and compilation hasn't finished — bump length. - - name: cold-start-unknown-small - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [2] - - - name: cold-start-unknown-medium - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [5] - - # Invocation latency — hot and cold paths through the Gateway NLB. - # Large worker counts to stress the load balancer and connection pool. 
- # size = number of workers created - # length = number of hot invocations per worker after the first cold one - - name: latency-small - iterations: 3 - clusterSize: [2] - size: [100, 500, 1000, 2000, 5000] - length: [2] - - - name: latency-medium - iterations: 3 - clusterSize: [2] - size: [100, 500, 1000, 2000] - length: [5] - - # Sleep — measures worker suspension and resumption under real network - # conditions. - # size = number of workers launched in parallel - # length = sleep duration in milliseconds - - name: sleep - iterations: 3 - clusterSize: [2] - size: [10, 100, 500, 1000] - length: [10000] - - # Durability overhead — measures the cost of durable vs ephemeral execution - # across four variants (durable-persistent, durable-non-persistent, - # ephemeral, durable-persistent-commit). - # size = number of workers per variant - # length = loop iteration count passed to oplog_heavy - - name: durability-overhead - iterations: 3 - clusterSize: [2] - size: [10, 50, 100, 200] - length: [5000] + # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore + # (component_size_coefficient) knob experiment, to get faster A/B results. + # Restore the entries below to run the full suite again. + # + # # size = number of workers per implementation + # # length = payload size in bytes sent to large_input + # - name: throughput-large-input + # iterations: 3 + # clusterSize: [2] + # size: [1, 10, 25, 50] + # length: [100, 10000] + # + # # size = number of workers per implementation + # # length = CPU work length passed to cpu_intensive + # - name: throughput-cpu-intensive + # iterations: 3 + # clusterSize: [2] + # size: [1, 10, 25, 50] + # length: [100] + # + # # Cold-start: compilation cache disabled — measures true cold-start latency + # # with no warm compiled artefact available. 
+ # # size = number of unique components created (each in its own env) + # # length = seconds to wait per component for pre-compilation warm-up + # - name: cold-start-unknown-small + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [2] + # disableCompilationCache: true + # + # - name: cold-start-unknown-medium + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [5] + # disableCompilationCache: true + # + # # Cold-start: compilation cache enabled — measures latency once the compiled + # # artefact is available in the cache. + # # size = number of unique components created (each in its own env) + # # length = seconds to wait per component for pre-compilation warm-up + # # NOTE: if results here are close to the cache-disabled entries above, the + # # warm-up wait is too short and compilation hasn't finished — bump length. + # - name: cold-start-unknown-small + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [2] + # + # - name: cold-start-unknown-medium + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [5] + # + # # Invocation latency — hot and cold paths through the Gateway NLB. + # # Large worker counts to stress the load balancer and connection pool. + # # size = number of workers created + # # length = number of hot invocations per worker after the first cold one + # - name: latency-small + # iterations: 3 + # clusterSize: [2] + # size: [100, 500, 1000, 2000, 5000] + # length: [2] + # + # - name: latency-medium + # iterations: 3 + # clusterSize: [2] + # size: [100, 500, 1000, 2000] + # length: [5] + # + # # Sleep — measures worker suspension and resumption under real network + # # conditions. 
+ # # size = number of workers launched in parallel + # # length = sleep duration in milliseconds + # - name: sleep + # iterations: 3 + # clusterSize: [2] + # size: [10, 100, 500, 1000] + # length: [10000] + # + # # Durability overhead — measures the cost of durable vs ephemeral execution + # # across four variants (durable-persistent, durable-non-persistent, + # # ephemeral, durable-persistent-commit). + # # size = number of workers per variant + # # length = loop iteration count passed to oplog_heavy + # - name: durability-overhead + # iterations: 3 + # clusterSize: [2] + # size: [10, 50, 100, 200] + # length: [5000] From faeb65149bdc0552b15f8eaf53d1eb1fb389324d Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 5 Jun 2026 23:50:44 -0700 Subject: [PATCH 12/60] feat(bench): try 200 apps after tuning --- integration-tests/benchmark_suites/cloud-perf.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 9cb7bded85..bcd4faaa6d 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -35,7 +35,7 @@ benchmarks: - name: throughput-echo iterations: 3 clusterSize: [2] - size: [1, 10, 50, 100] + size: [1, 10, 50, 100, 200] length: [1000] # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore From f8dd565f6049538247e52edab148ab69f174127e Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sat, 6 Jun 2026 01:04:56 -0700 Subject: [PATCH 13/60] feat: try 250 again --- integration-tests/benchmark_suites/cloud-perf.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index bcd4faaa6d..d4ff6e3b23 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ 
b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -35,7 +35,7 @@ benchmarks: - name: throughput-echo iterations: 3 clusterSize: [2] - size: [1, 10, 50, 100, 200] + size: [1, 10, 50, 100, 250] length: [1000] # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore From 1bf006314d5210c6d9434209d338be02c865da03 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sat, 6 Jun 2026 21:52:55 -0700 Subject: [PATCH 14/60] feat(benchmark): run all the tests again --- .../benchmark_suites/cloud-perf.yaml | 184 +++++++++--------- .../src/benchmarks/throughput.rs | 47 ++++- 2 files changed, 134 insertions(+), 97 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index d4ff6e3b23..6e258c3c99 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -20,10 +20,6 @@ # RPC worker pairs and HTTP deployments — the most complex setup. Running them # early surfaces infrastructure issues (stuck workers, port-forward drops) # before spending time on the simpler benchmarks. -# -# NOTE: All benchmarks except throughput-echo are temporarily commented out for -# the memory-semaphore (component_size_coefficient) knob experiment. Restore -# them to run the full suite. name: cloud-perf benchmarks: @@ -38,95 +34,91 @@ benchmarks: size: [1, 10, 50, 100, 250] length: [1000] - # TEMPORARILY DISABLED: only throughput-echo runs for the memory-semaphore - # (component_size_coefficient) knob experiment, to get faster A/B results. - # Restore the entries below to run the full suite again. 
- # - # # size = number of workers per implementation - # # length = payload size in bytes sent to large_input - # - name: throughput-large-input - # iterations: 3 - # clusterSize: [2] - # size: [1, 10, 25, 50] - # length: [100, 10000] - # - # # size = number of workers per implementation - # # length = CPU work length passed to cpu_intensive - # - name: throughput-cpu-intensive - # iterations: 3 - # clusterSize: [2] - # size: [1, 10, 25, 50] - # length: [100] - # - # # Cold-start: compilation cache disabled — measures true cold-start latency - # # with no warm compiled artefact available. - # # size = number of unique components created (each in its own env) - # # length = seconds to wait per component for pre-compilation warm-up - # - name: cold-start-unknown-small - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [2] - # disableCompilationCache: true - # - # - name: cold-start-unknown-medium - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [5] - # disableCompilationCache: true - # - # # Cold-start: compilation cache enabled — measures latency once the compiled - # # artefact is available in the cache. - # # size = number of unique components created (each in its own env) - # # length = seconds to wait per component for pre-compilation warm-up - # # NOTE: if results here are close to the cache-disabled entries above, the - # # warm-up wait is too short and compilation hasn't finished — bump length. - # - name: cold-start-unknown-small - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [2] - # - # - name: cold-start-unknown-medium - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [5] - # - # # Invocation latency — hot and cold paths through the Gateway NLB. - # # Large worker counts to stress the load balancer and connection pool. 
- # # size = number of workers created - # # length = number of hot invocations per worker after the first cold one - # - name: latency-small - # iterations: 3 - # clusterSize: [2] - # size: [100, 500, 1000, 2000, 5000] - # length: [2] - # - # - name: latency-medium - # iterations: 3 - # clusterSize: [2] - # size: [100, 500, 1000, 2000] - # length: [5] - # - # # Sleep — measures worker suspension and resumption under real network - # # conditions. - # # size = number of workers launched in parallel - # # length = sleep duration in milliseconds - # - name: sleep - # iterations: 3 - # clusterSize: [2] - # size: [10, 100, 500, 1000] - # length: [10000] - # - # # Durability overhead — measures the cost of durable vs ephemeral execution - # # across four variants (durable-persistent, durable-non-persistent, - # # ephemeral, durable-persistent-commit). - # # size = number of workers per variant - # # length = loop iteration count passed to oplog_heavy - # - name: durability-overhead - # iterations: 3 - # clusterSize: [2] - # size: [10, 50, 100, 200] - # length: [5000] + # size = number of workers per implementation + # length = payload size in bytes sent to large_input + - name: throughput-large-input + iterations: 3 + clusterSize: [2] + size: [1, 10, 25, 50] + length: [100, 10000] + + # size = number of workers per implementation + # length = CPU work length passed to cpu_intensive + - name: throughput-cpu-intensive + iterations: 3 + clusterSize: [2] + size: [1, 10, 25, 50] + length: [100] + + # Cold-start: compilation cache disabled — measures true cold-start latency + # with no warm compiled artefact available. 
+ # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [2] + disableCompilationCache: true + + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [5] + disableCompilationCache: true + + # Cold-start: compilation cache enabled — measures latency once the compiled + # artefact is available in the cache. + # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + # NOTE: if results here are close to the cache-disabled entries above, the + # warm-up wait is too short and compilation hasn't finished — bump length. + - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [2] + + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [5] + + # Invocation latency — hot and cold paths through the Gateway NLB. + # Large worker counts to stress the load balancer and connection pool. + # size = number of workers created + # length = number of hot invocations per worker after the first cold one + - name: latency-small + iterations: 3 + clusterSize: [2] + size: [100, 500, 1000, 2000, 5000] + length: [2] + + - name: latency-medium + iterations: 3 + clusterSize: [2] + size: [100, 500, 1000, 2000] + length: [5] + + # Sleep — measures worker suspension and resumption under real network + # conditions. 
+ # size = number of workers launched in parallel + # length = sleep duration in milliseconds + - name: sleep + iterations: 3 + clusterSize: [2] + size: [10, 100, 500, 1000] + length: [10000] + + # Durability overhead — measures the cost of durable vs ephemeral execution + # across four variants (durable-persistent, durable-non-persistent, + # ephemeral, durable-persistent-commit). + # size = number of workers per variant + # length = loop iteration count passed to oplog_heavy + - name: durability-overhead + iterations: 3 + clusterSize: [2] + size: [10, 50, 100, 200] + length: [5000] diff --git a/integration-tests/src/benchmarks/throughput.rs b/integration-tests/src/benchmarks/throughput.rs index 5515090847..f3552e0eee 100644 --- a/integration-tests/src/benchmarks/throughput.rs +++ b/integration-tests/src/benchmarks/throughput.rs @@ -29,7 +29,7 @@ use golem_common::model::http_api_deployment::{ }; use golem_common::model::{AgentId, RoutingTable}; use golem_common::{agent_id, data_value}; -use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; +use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig}; use golem_test_framework::config::benchmark::TestMode; use golem_test_framework::config::dsl_impl::TestUserContext; use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies}; @@ -38,6 +38,7 @@ use indoc::indoc; use reqwest::{Body, Method, Request, Url}; use serde_json::json; use std::collections::BTreeMap; +use std::time::Instant; use tracing::{Instrument, Level, info}; pub struct ThroughputEcho { @@ -460,6 +461,31 @@ fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> V .collect() } +/// Records aggregate throughput (invocations per second) for a measurement +/// block as a `count` result under the key `{prefix}throughput-ops-per-sec`. 
+/// +/// `total_calls` is the total number of invocations issued across all targets +/// in the block; `elapsed` is the wall-clock duration of the concurrently +/// executed block. Throughput is therefore the realised aggregate rate the +/// cluster sustained for this implementation, not a per-call latency. +fn record_throughput( + recorder: &BenchmarkRecorder, + prefix: &str, + total_calls: usize, + elapsed: std::time::Duration, +) { + let secs = elapsed.as_secs_f64(); + if secs <= 0.0 || total_calls == 0 { + return; + } + let ops_per_sec = (total_calls as f64 / secs).round() as u64; + info!("{prefix}throughput: {total_calls} calls in {secs:.3}s = {ops_per_sec} ops/sec"); + recorder.count( + &ResultKey::primary(format!("{prefix}throughput-ops-per-sec")), + ops_per_sec, + ); +} + impl ThroughputBenchmark { pub async fn new( rust_method_name: &str, @@ -796,7 +822,10 @@ impl ThroughputBenchmark { }) .collect::>(); + let started = Instant::now(); let results = result_futures.join().await; + let elapsed = started.elapsed(); + record_throughput(recorder, prefix, targets.len() * call_count, elapsed); for (idx, (results, target)) in results.iter().zip(targets).enumerate() { let prefix = target.prefix(prefix, routing_table); for result in results { @@ -903,7 +932,15 @@ impl ThroughputBenchmark { }) .collect::>(); + let started = Instant::now(); let results = result_futures.join().await; + let elapsed = started.elapsed(); + record_throughput( + &recorder, + "rust-agent-http-", + iteration.rust_agent_ids_for_http.len() * self.call_count, + elapsed, + ); for (idx, results) in results.iter().enumerate() { for result in results { result.record(&recorder, "rust-agent-http-", idx.to_string().as_str()); @@ -936,7 +973,15 @@ impl ThroughputBenchmark { }) .collect::>(); + let started = Instant::now(); let results = result_futures.join().await; + let elapsed = started.elapsed(); + record_throughput( + &recorder, + "ts-agent-http-", + iteration.ts_agent_ids_for_http.len() * 
self.call_count, + elapsed, + ); for (idx, results) in results.iter().enumerate() { for result in results { result.record(&recorder, "ts-agent-http-", idx.to_string().as_str()); From 2e53af6bb401a26f17caf042cde010bc381be32e Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sat, 6 Jun 2026 23:34:22 -0700 Subject: [PATCH 15/60] fix: metric description --- golem-worker-executor/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index c9d610e79d..de6d673632 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -518,7 +518,7 @@ pub mod wasm { .unwrap(); static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!( "worker_resident_linear_memory_bytes", - "Per-worker cumulative linear memory size (total_linear_memory_size) observed when acquiring a memory permit", + "Per-worker cumulative linear-memory ceiling (total_linear_memory_size = sum of memory.grow deltas) sampled at permit acquire. 
This is the semaphore charge basis (x*ml), an upper bound on resident RSS, NOT measured resident memory (grown pages are largely demand-paged); compare to container_memory_working_set_bytes for the gap", crate::metrics::MEMORY_SIZE_BUCKETS.to_vec() ) .unwrap(); From 32ef9e59a1eb5edfd0f96c2d038c32a8c7c20c8b Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 7 Jun 2026 00:23:48 -0700 Subject: [PATCH 16/60] feat: proper load for our cluster --- .../benchmark_suites/cloud-perf.yaml | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 6e258c3c99..088e5c43f7 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -36,10 +36,13 @@ benchmarks: # size = number of workers per implementation # length = payload size in bytes sent to large_input + # NOTE: large payloads grow worker linear memory, so this is the throughput + # benchmark most relevant to the memory-admission investigation — sized to + # match throughput-echo so it exercises real density. - name: throughput-large-input iterations: 3 clusterSize: [2] - size: [1, 10, 25, 50] + size: [1, 10, 50, 100, 250] length: [100, 10000] # size = number of workers per implementation @@ -47,13 +50,18 @@ benchmarks: - name: throughput-cpu-intensive iterations: 3 clusterSize: [2] - size: [1, 10, 25, 50] + size: [1, 10, 50, 100, 250] length: [100] # Cold-start: compilation cache disabled — measures true cold-start latency # with no warm compiled artefact available. # size = number of unique components created (each in its own env) # length = seconds to wait per component for pre-compilation warm-up + # NOTE: each unit here is a UNIQUE component, so size scales compilations + # (compilation-service + S3 cache load), not worker density. 
Kept at max 20: + # cold-start latency is already characterized there, and the cache-enabled + # warmup sleeps length*size seconds per run, so larger sizes mostly add idle + # wait rather than signal. - name: cold-start-unknown-small iterations: 3 clusterSize: [2] @@ -103,22 +111,25 @@ benchmarks: length: [5] # Sleep — measures worker suspension and resumption under real network - # conditions. + # conditions. High residency: all `size` workers held in memory sleeping at + # once, so this also probes how many resident workers fit (memory-admission + # relevant) — pushed past the ~2000 echo proved out. # size = number of workers launched in parallel # length = sleep duration in milliseconds - name: sleep iterations: 3 clusterSize: [2] - size: [10, 100, 500, 1000] + size: [10, 100, 500, 1000, 2000] length: [10000] # Durability overhead — measures the cost of durable vs ephemeral execution # across four variants (durable-persistent, durable-non-persistent, - # ephemeral, durable-persistent-commit). + # ephemeral, durable-persistent-commit). size workers concurrent per phase; + # sized up to put real load on the oplog/persistence/storage path. 
# size = number of workers per variant # length = loop iteration count passed to oplog_heavy - name: durability-overhead iterations: 3 clusterSize: [2] - size: [10, 50, 100, 200] + size: [10, 50, 100, 250] length: [5000] From bc117799dad080b28bfbe55467f42a3233e3d257 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 7 Jun 2026 21:00:41 -0700 Subject: [PATCH 17/60] feat(benchmark): run only benchmark tests --- .../benchmark_suites/cloud-perf.yaml | 152 +++++++++--------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 088e5c43f7..f7aab7e9f4 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -31,7 +31,7 @@ benchmarks: - name: throughput-echo iterations: 3 clusterSize: [2] - size: [1, 10, 50, 100, 250] + size: [1, 50, 100, 250] length: [1000] # size = number of workers per implementation @@ -42,7 +42,7 @@ benchmarks: - name: throughput-large-input iterations: 3 clusterSize: [2] - size: [1, 10, 50, 100, 250] + size: [1, 50, 100, 250] length: [100, 10000] # size = number of workers per implementation @@ -50,86 +50,86 @@ benchmarks: - name: throughput-cpu-intensive iterations: 3 clusterSize: [2] - size: [1, 10, 50, 100, 250] + size: [1, 50, 100, 250] length: [100] - # Cold-start: compilation cache disabled — measures true cold-start latency - # with no warm compiled artefact available. - # size = number of unique components created (each in its own env) - # length = seconds to wait per component for pre-compilation warm-up - # NOTE: each unit here is a UNIQUE component, so size scales compilations - # (compilation-service + S3 cache load), not worker density. 
Kept at max 20: - # cold-start latency is already characterized there, and the cache-enabled - # warmup sleeps length*size seconds per run, so larger sizes mostly add idle - # wait rather than signal. - - name: cold-start-unknown-small - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [2] - disableCompilationCache: true + # # Cold-start: compilation cache disabled — measures true cold-start latency + # # with no warm compiled artefact available. + # # size = number of unique components created (each in its own env) + # # length = seconds to wait per component for pre-compilation warm-up + # # NOTE: each unit here is a UNIQUE component, so size scales compilations + # # (compilation-service + S3 cache load), not worker density. Kept at max 20: + # # cold-start latency is already characterized there, and the cache-enabled + # # warmup sleeps length*size seconds per run, so larger sizes mostly add idle + # # wait rather than signal. + # - name: cold-start-unknown-small + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [2] + # disableCompilationCache: true - - name: cold-start-unknown-medium - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [5] - disableCompilationCache: true + # - name: cold-start-unknown-medium + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [5] + # disableCompilationCache: true - # Cold-start: compilation cache enabled — measures latency once the compiled - # artefact is available in the cache. - # size = number of unique components created (each in its own env) - # length = seconds to wait per component for pre-compilation warm-up - # NOTE: if results here are close to the cache-disabled entries above, the - # warm-up wait is too short and compilation hasn't finished — bump length. 
- - name: cold-start-unknown-small - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [2] + # # Cold-start: compilation cache enabled — measures latency once the compiled + # # artefact is available in the cache. + # # size = number of unique components created (each in its own env) + # # length = seconds to wait per component for pre-compilation warm-up + # # NOTE: if results here are close to the cache-disabled entries above, the + # # warm-up wait is too short and compilation hasn't finished — bump length. + # - name: cold-start-unknown-small + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [2] - - name: cold-start-unknown-medium - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 20] - length: [5] + # - name: cold-start-unknown-medium + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 20] + # length: [5] - # Invocation latency — hot and cold paths through the Gateway NLB. - # Large worker counts to stress the load balancer and connection pool. - # size = number of workers created - # length = number of hot invocations per worker after the first cold one - - name: latency-small - iterations: 3 - clusterSize: [2] - size: [100, 500, 1000, 2000, 5000] - length: [2] + # # Invocation latency — hot and cold paths through the Gateway NLB. + # # Large worker counts to stress the load balancer and connection pool. + # # size = number of workers created + # # length = number of hot invocations per worker after the first cold one + # - name: latency-small + # iterations: 3 + # clusterSize: [2] + # size: [100, 500, 1000, 2000, 5000] + # length: [2] - - name: latency-medium - iterations: 3 - clusterSize: [2] - size: [100, 500, 1000, 2000] - length: [5] + # - name: latency-medium + # iterations: 3 + # clusterSize: [2] + # size: [100, 500, 1000, 2000] + # length: [5] - # Sleep — measures worker suspension and resumption under real network - # conditions. 
High residency: all `size` workers held in memory sleeping at - # once, so this also probes how many resident workers fit (memory-admission - # relevant) — pushed past the ~2000 echo proved out. - # size = number of workers launched in parallel - # length = sleep duration in milliseconds - - name: sleep - iterations: 3 - clusterSize: [2] - size: [10, 100, 500, 1000, 2000] - length: [10000] + # # Sleep — measures worker suspension and resumption under real network + # # conditions. High residency: all `size` workers held in memory sleeping at + # # once, so this also probes how many resident workers fit (memory-admission + # # relevant) — pushed past the ~2000 echo proved out. + # # size = number of workers launched in parallel + # # length = sleep duration in milliseconds + # - name: sleep + # iterations: 3 + # clusterSize: [2] + # size: [10, 100, 500, 1000, 2000] + # length: [10000] - # Durability overhead — measures the cost of durable vs ephemeral execution - # across four variants (durable-persistent, durable-non-persistent, - # ephemeral, durable-persistent-commit). size workers concurrent per phase; - # sized up to put real load on the oplog/persistence/storage path. - # size = number of workers per variant - # length = loop iteration count passed to oplog_heavy - - name: durability-overhead - iterations: 3 - clusterSize: [2] - size: [10, 50, 100, 250] - length: [5000] + # # Durability overhead — measures the cost of durable vs ephemeral execution + # # across four variants (durable-persistent, durable-non-persistent, + # # ephemeral, durable-persistent-commit). size workers concurrent per phase; + # # sized up to put real load on the oplog/persistence/storage path. 
+ # # size = number of workers per variant + # # length = loop iteration count passed to oplog_heavy + # - name: durability-overhead + # iterations: 3 + # clusterSize: [2] + # size: [10, 50, 100, 250] + # length: [5000] From 534762695f6ba8a8288f4c6c10c6acc237de69bd Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 7 Jun 2026 21:04:21 -0700 Subject: [PATCH 18/60] feat: enable all tests again --- .../benchmark_suites/cloud-perf.yaml | 146 +++++++++--------- 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index f7aab7e9f4..81ff01bfc5 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -53,83 +53,83 @@ benchmarks: size: [1, 50, 100, 250] length: [100] - # # Cold-start: compilation cache disabled — measures true cold-start latency - # # with no warm compiled artefact available. - # # size = number of unique components created (each in its own env) - # # length = seconds to wait per component for pre-compilation warm-up - # # NOTE: each unit here is a UNIQUE component, so size scales compilations - # # (compilation-service + S3 cache load), not worker density. Kept at max 20: - # # cold-start latency is already characterized there, and the cache-enabled - # # warmup sleeps length*size seconds per run, so larger sizes mostly add idle - # # wait rather than signal. - # - name: cold-start-unknown-small - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [2] - # disableCompilationCache: true + # Cold-start: compilation cache disabled — measures true cold-start latency + # with no warm compiled artefact available. 
+ # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + # NOTE: each unit here is a UNIQUE component, so size scales compilations + # (compilation-service + S3 cache load), not worker density. Kept at max 20: + # cold-start latency is already characterized there, and the cache-enabled + # warmup sleeps length*size seconds per run, so larger sizes mostly add idle + # wait rather than signal. + - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [2] + disableCompilationCache: true - # - name: cold-start-unknown-medium - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [5] - # disableCompilationCache: true + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [5] + disableCompilationCache: true - # # Cold-start: compilation cache enabled — measures latency once the compiled - # # artefact is available in the cache. - # # size = number of unique components created (each in its own env) - # # length = seconds to wait per component for pre-compilation warm-up - # # NOTE: if results here are close to the cache-disabled entries above, the - # # warm-up wait is too short and compilation hasn't finished — bump length. - # - name: cold-start-unknown-small - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [2] + # Cold-start: compilation cache enabled — measures latency once the compiled + # artefact is available in the cache. + # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + # NOTE: if results here are close to the cache-disabled entries above, the + # warm-up wait is too short and compilation hasn't finished — bump length. 
+ - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [2] - # - name: cold-start-unknown-medium - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 20] - # length: [5] + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 10, 20] + length: [5] - # # Invocation latency — hot and cold paths through the Gateway NLB. - # # Large worker counts to stress the load balancer and connection pool. - # # size = number of workers created - # # length = number of hot invocations per worker after the first cold one - # - name: latency-small - # iterations: 3 - # clusterSize: [2] - # size: [100, 500, 1000, 2000, 5000] - # length: [2] + # Invocation latency — hot and cold paths through the Gateway NLB. + # Large worker counts to stress the load balancer and connection pool. + # size = number of workers created + # length = number of hot invocations per worker after the first cold one + - name: latency-small + iterations: 3 + clusterSize: [2] + size: [100, 500, 1000, 2000, 5000] + length: [2] - # - name: latency-medium - # iterations: 3 - # clusterSize: [2] - # size: [100, 500, 1000, 2000] - # length: [5] + - name: latency-medium + iterations: 3 + clusterSize: [2] + size: [100, 500, 1000, 2000] + length: [5] - # # Sleep — measures worker suspension and resumption under real network - # # conditions. High residency: all `size` workers held in memory sleeping at - # # once, so this also probes how many resident workers fit (memory-admission - # # relevant) — pushed past the ~2000 echo proved out. - # # size = number of workers launched in parallel - # # length = sleep duration in milliseconds - # - name: sleep - # iterations: 3 - # clusterSize: [2] - # size: [10, 100, 500, 1000, 2000] - # length: [10000] + # Sleep — measures worker suspension and resumption under real network + # conditions. 
High residency: all `size` workers held in memory sleeping at + # once, so this also probes how many resident workers fit (memory-admission + # relevant) — pushed past the ~2000 echo proved out. + # size = number of workers launched in parallel + # length = sleep duration in milliseconds + - name: sleep + iterations: 3 + clusterSize: [2] + size: [10, 100, 500, 1000, 2000] + length: [10000] - # # Durability overhead — measures the cost of durable vs ephemeral execution - # # across four variants (durable-persistent, durable-non-persistent, - # # ephemeral, durable-persistent-commit). size workers concurrent per phase; - # # sized up to put real load on the oplog/persistence/storage path. - # # size = number of workers per variant - # # length = loop iteration count passed to oplog_heavy - # - name: durability-overhead - # iterations: 3 - # clusterSize: [2] - # size: [10, 50, 100, 250] - # length: [5000] + # Durability overhead — measures the cost of durable vs ephemeral execution + # across four variants (durable-persistent, durable-non-persistent, + # ephemeral, durable-persistent-commit). size workers concurrent per phase; + # sized up to put real load on the oplog/persistence/storage path. 
+ # size = number of workers per variant + # length = loop iteration count passed to oplog_heavy + - name: durability-overhead + iterations: 3 + clusterSize: [2] + size: [10, 50, 100, 250] + length: [5000] From 9e582a2603cbbe1b09697a61f4d0e29ed08087a0 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 7 Jun 2026 21:13:29 -0700 Subject: [PATCH 19/60] feat(benchmark): increase max number of concurrent compilations --- integration-tests/benchmark_suites/cloud-perf.yaml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 81ff01bfc5..d508c5b5fc 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -57,22 +57,17 @@ benchmarks: # with no warm compiled artefact available. # size = number of unique components created (each in its own env) # length = seconds to wait per component for pre-compilation warm-up - # NOTE: each unit here is a UNIQUE component, so size scales compilations - # (compilation-service + S3 cache load), not worker density. Kept at max 20: - # cold-start latency is already characterized there, and the cache-enabled - # warmup sleeps length*size seconds per run, so larger sizes mostly add idle - # wait rather than signal. 
- name: cold-start-unknown-small iterations: 3 clusterSize: [2] - size: [1, 5, 10, 20] + size: [1, 5, 10, 25, 50] length: [2] disableCompilationCache: true - name: cold-start-unknown-medium iterations: 3 clusterSize: [2] - size: [1, 5, 10, 20] + size: [1, 5, 10, 25, 50] length: [5] disableCompilationCache: true @@ -85,13 +80,13 @@ benchmarks: - name: cold-start-unknown-small iterations: 3 clusterSize: [2] - size: [1, 5, 10, 20] + size: [1, 5, 10, 25, 50] length: [2] - name: cold-start-unknown-medium iterations: 3 clusterSize: [2] - size: [1, 5, 10, 20] + size: [1, 5, 10, 25, 50] length: [5] # Invocation latency — hot and cold paths through the Gateway NLB. From e7b44bf3e5e2d83a3df3418d72c38219654aa40b Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Mon, 8 Jun 2026 16:53:27 -0700 Subject: [PATCH 20/60] feat(worker-executor): add measured-headroom memory admission gate --- .../config/debug-worker-executor.sample.env | 2 + .../config/debug-worker-executor.toml | 2 + .../config/worker-executor.sample.env | 3 + .../config/worker-executor.toml | 3 + .../services/active_workers/admission/mod.rs | 160 ++++++ .../active_workers/admission/tests.rs | 508 ++++++++++++++++++ .../services/active_workers/memory_probe.rs | 180 +++++++ .../src/services/active_workers/mod.rs | 190 ++++--- .../src/services/golem_config.rs | 21 + 9 files changed, 1006 insertions(+), 63 deletions(-) create mode 100644 golem-worker-executor/src/services/active_workers/admission/mod.rs create mode 100644 golem-worker-executor/src/services/active_workers/admission/tests.rs create mode 100644 golem-worker-executor/src/services/active_workers/memory_probe.rs diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env index 66afafc82a..d717cf777a 100644 --- a/golem-debugging-service/config/debug-worker-executor.sample.env +++ 
b/golem-debugging-service/config/debug-worker-executor.sample.env @@ -55,6 +55,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 @@ -229,6 +230,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml index 7191c36996..7d23b08cd5 100644 --- a/golem-debugging-service/config/debug-worker-executor.toml +++ b/golem-debugging-service/config/debug-worker-executor.toml @@ -96,6 +96,7 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" +admission_reserve_bytes = 268435456 component_size_coefficient = 2.0 worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -365,6 +366,7 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index 2ef7701cc5..dc33d7b3c1 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env @@ -72,6 +72,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 
GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 @@ -292,6 +293,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 @@ -482,6 +484,7 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index b1bab39be9..265ec5f904 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ b/golem-worker-executor/config/worker-executor.toml @@ -125,6 +125,7 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" +admission_reserve_bytes = 268435456 component_size_coefficient = 2.0 worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -457,6 +458,7 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 @@ -759,6 +761,7 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 diff --git 
a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs new file mode 100644 index 0000000000..702dc003e7 --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs @@ -0,0 +1,160 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Measured-headroom admission decision. +//! +//! Gates worker admission on the executor environment's *real* memory headroom +//! read from the [`MemoryProbe`], rather than on the estimate-based semaphore in +//! [`super::ActiveWorkers`]. The two work together: the semaphore is a cheap, +//! high-frequency pre-filter over reserved-but-not-yet-resident intent; this +//! controller is the authoritative check against measured resident usage. When +//! headroom is short it evicts already-resident idle-then-warm work; if it still +//! cannot make room it rejects rather than over-committing. +//! +//! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`] +//! trait so its decision logic can be exercised in isolation with synthetic +//! probes and candidate sets. + +use super::memory_probe::MemoryProbe; +use async_trait::async_trait; + +/// Why an eviction candidate is worth evicting, in priority order. Lower +/// variants are evicted first. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub enum EvictionPriority { + /// Resident in memory, not executing, no durable pending work. Cheapest to + /// evict — losing it costs at most a re-load on next use. + Idle, + /// Resident in memory, not executing, but has durable pending work. Evicted + /// only after all idle candidates are exhausted. + Warm, +} + +/// A source of evictable, already-resident memory the controller can reclaim to +/// restore headroom. Abstracts over the live worker set so the decision logic +/// is testable without `Worker`/wasmtime. +#[async_trait] +pub trait EvictionSource: Send + Sync { + /// Evict at the given priority tier, attempting to free at least + /// `needed_bytes`. Returns the number of bytes actually reclaimed (which may + /// be less if the tier is exhausted, or more if a single victim was larger + /// than needed). Must not evict from a higher (more expensive) tier than the + /// one requested. + async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64; +} + +/// The outcome of an admission attempt. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AdmissionDecision { + /// There is enough real headroom (possibly after eviction) to admit the + /// request without risking the limit. + Admit, + /// Not enough headroom could be freed; the request must back off rather + /// than over-commit. + Reject, +} + +/// Configuration for the headroom-based admission decision. +/// +/// Two knobs with distinct jobs: +/// +/// * `usable_ratio` — fraction of the measured limit usable for WASM admission. +/// The remainder is left for the host (the executor process, allocator +/// arenas, runtime buffers). Mirrors `worker_memory_ratio`, but applied to the +/// measured limit rather than the configured total. +/// +/// * `reserve_bytes` — margin kept free below the carve-out ceiling to absorb +/// the window in which concurrent admissions are observed before becoming +/// resident. 
Its sufficiency under concurrency is asserted by the property +/// test in `tests.rs`. +#[derive(Debug, Clone, Copy)] +pub struct AdmissionPolicy { + /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission. + pub usable_ratio: f64, + /// Dynamic safety margin kept free below the carve-out ceiling. + pub reserve_bytes: u64, +} + +/// Decides admission against measured headroom, evicting resident idle/warm +/// work as needed. Holds only its policy and probe; live state is read fresh +/// from the probe and the eviction source on each call (never cached). +pub struct AdmissionController { + probe: Box<dyn MemoryProbe>, + policy: AdmissionPolicy, +} + +impl AdmissionController { + pub fn new(probe: Box<dyn MemoryProbe>, policy: AdmissionPolicy) -> Self { + Self { probe, policy } + } + + /// Bytes available for new admissions: the carve-out ceiling + /// (`usable_ratio × limit`) minus current usage minus the reserve. + /// Saturating throughout — never underflows when already over a ceiling. + fn admissible_headroom(&self) -> u64 { + let snapshot = self.probe.snapshot(); + let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64; + ceiling + .saturating_sub(snapshot.current_bytes) + .saturating_sub(self.policy.reserve_bytes) + } + + /// Decide whether `request_bytes` can be admitted, evicting from `source` if + /// the current headroom is insufficient. + /// + /// Eviction is attempted idle-first, then warm, and only up to the shortfall + /// (never evicts when headroom already suffices). After eviction the + /// headroom is re-measured against ground truth; the request is admitted only + /// if the real headroom now covers it, otherwise it is rejected. + pub async fn try_admit( + &self, + request_bytes: u64, + source: &dyn EvictionSource, + ) -> AdmissionDecision { + // Fast path: enough real headroom already, admit without evicting.
+ if self.admissible_headroom() >= request_bytes { + return AdmissionDecision::Admit; + } + + // Reclaim resident, idle-then-warm work up to the shortfall. + let shortfall = request_bytes.saturating_sub(self.admissible_headroom()); + let mut remaining = shortfall; + + for priority in [EvictionPriority::Idle, EvictionPriority::Warm] { + if remaining == 0 { + break; + } + let freed = source.evict_at_most(priority, remaining).await; + remaining = remaining.saturating_sub(freed); + } + + // Re-measure against ground truth rather than trusting the freed tally: + // the probe is the authority, and other activity may have moved usage + // in either direction while we were evicting. + if self.admissible_headroom() >= request_bytes { + AdmissionDecision::Admit + } else { + AdmissionDecision::Reject + } + } + + /// The current admissible headroom. Exposed for metrics and for callers that + /// want to make their own pre-check. + pub fn headroom_bytes(&self) -> u64 { + self.admissible_headroom() + } +} + +#[cfg(test)] +mod tests; diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs new file mode 100644 index 0000000000..bd9b51aabb --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs @@ -0,0 +1,508 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
Property-based and example tests for the measured-headroom admission valve. +//! +//! These tests model an executor environment as a shared cell holding a hard +//! `limit`, the current resident `usage`, and the set of resident evictable +//! work (each item carrying a size and an eviction priority). A [`FakeProbe`] +//! reports `usage`/`limit` from the cell; a [`FakeEvictionSource`] reclaims +//! idle-then-warm items and decrements `usage`. Admitting a request adds its +//! size to `usage` as a new resident, non-evictable item (it is actively being +//! created). +//! +//! The model lets `proptest` drive thousands of random admit sequences — with +//! random request sizes, pre-resident work, and limits — and assert the +//! invariants that *define* a correct safety valve: +//! +//! 1. Safety: usage never exceeds the limit (the environment never OOMs). +//! 2. No spurious eviction: when headroom is ample, nothing is evicted. +//! 3. Eviction ordering: idle work is reclaimed before warm work. +//! 4. Clean rejection: when the request genuinely cannot fit, the decision is +//! `Reject` and no over-commit happens. + +use super::*; +use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot}; +use proptest::prelude::*; +use std::sync::{Arc, Mutex}; +use test_r::test; + +test_r::enable!(); + +/// One unit of resident, evictable work in the model. +#[derive(Debug, Clone, Copy)] +struct Resident { + size: u64, + priority: EvictionPriority, +} + +/// Shared model of the executor environment's memory. +#[derive(Debug, Default)] +struct EnvState { + limit: u64, + /// Resident bytes attributed to admitted, currently-active requests that + /// are not yet evictable (they are mid-admission). + pinned_usage: u64, + /// Resident, evictable work — what the controller may reclaim. + residents: Vec<Resident>, + /// Count of evictions performed, for the no-spurious-eviction property. + evictions: usize, + /// The priorities evicted, in order, for the ordering property.
+ eviction_order: Vec<EvictionPriority>, +} + +impl EnvState { + fn usage(&self) -> u64 { + self.pinned_usage + self.residents.iter().map(|r| r.size).sum::<u64>() + } +} + +#[derive(Debug, Clone)] +struct FakeProbe { + state: Arc<Mutex<EnvState>>, +} + +impl MemoryProbe for FakeProbe { + fn snapshot(&self) -> MemorySnapshot { + let state = self.state.lock().unwrap(); + MemorySnapshot { + limit_bytes: state.limit, + current_bytes: state.usage(), + } + } +} + +struct FakeEvictionSource { + state: Arc<Mutex<EnvState>>, +} + +#[async_trait::async_trait] +impl EvictionSource for FakeEvictionSource { + async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 { + let mut state = self.state.lock().unwrap(); + let mut freed = 0u64; + // Evict only at the requested tier, oldest-first (model: vec order), + // until we have freed at least `needed_bytes` or the tier is empty. + let mut i = 0; + while freed < needed_bytes && i < state.residents.len() { + if state.residents[i].priority == priority { + let victim = state.residents.remove(i); + freed += victim.size; + state.evictions += 1; + state.eviction_order.push(priority); + } else { + i += 1; + } + } + freed + } +} + +fn controller(state: Arc<Mutex<EnvState>>, reserve_bytes: u64) -> AdmissionController { + controller_with_ratio(state, 1.0, reserve_bytes) +} + +fn controller_with_ratio( + state: Arc<Mutex<EnvState>>, + usable_ratio: f64, + reserve_bytes: u64, +) -> AdmissionController { + AdmissionController::new( + Box::new(FakeProbe { + state: state.clone(), + }), + AdmissionPolicy { + usable_ratio, + reserve_bytes, + }, + ) +} + +/// Apply one admission attempt against the model, mutating `usage` on admit.
+async fn apply_admit( + controller: &AdmissionController, + source: &FakeEvictionSource, + state: &Arc<Mutex<EnvState>>, + request: u64, +) -> AdmissionDecision { + let decision = controller.try_admit(request, source).await; + if decision == AdmissionDecision::Admit { + state.lock().unwrap().pinned_usage += request; + } + decision +} + +// ── Single-case unit tests ─────────────────────────────────────────────────── + +#[test] +async fn admits_when_headroom_is_ample_without_evicting() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + pinned_usage: 0, + residents: vec![Resident { + size: 100, + priority: EvictionPriority::Idle, + }], + ..Default::default() + })); + let ctrl = controller(state.clone(), 0); + let source = FakeEvictionSource { + state: state.clone(), + }; + + let decision = apply_admit(&ctrl, &source, &state, 200).await; + assert_eq!(decision, AdmissionDecision::Admit); + // Nothing should have been evicted — there was plenty of headroom. + assert_eq!(state.lock().unwrap().evictions, 0); +} + +#[test] +async fn evicts_idle_before_warm() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + pinned_usage: 0, + residents: vec![ + Resident { + size: 400, + priority: EvictionPriority::Warm, + }, + Resident { + size: 400, + priority: EvictionPriority::Idle, + }, + ], + ..Default::default() + })); + // usage = 800, limit = 1000, headroom = 200. Request 300 → shortfall 100. + // One idle (400) covers it; warm must remain untouched.
+ let ctrl = controller(state.clone(), 0); + let source = FakeEvictionSource { + state: state.clone(), + }; + + let decision = apply_admit(&ctrl, &source, &state, 300).await; + assert_eq!(decision, AdmissionDecision::Admit); + + let s = state.lock().unwrap(); + assert_eq!(s.eviction_order, vec![EvictionPriority::Idle]); + assert!(s.usage() <= s.limit); +} + +#[test] +async fn rejects_when_nothing_can_be_freed() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + // All usage is pinned (mid-admission), nothing evictable. + pinned_usage: 950, + residents: vec![], + ..Default::default() + })); + let ctrl = controller(state.clone(), 0); + let source = FakeEvictionSource { + state: state.clone(), + }; + + let decision = apply_admit(&ctrl, &source, &state, 200).await; + assert_eq!(decision, AdmissionDecision::Reject); + // No over-commit: usage unchanged. + assert_eq!(state.lock().unwrap().usage(), 950); +} + +#[test] +async fn reserve_is_kept_free() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + pinned_usage: 700, + residents: vec![], + ..Default::default() + })); + // headroom = 300, reserve = 200 → admissible = 100. Request 150 → reject. + let ctrl = controller(state.clone(), 200); + let source = FakeEvictionSource { + state: state.clone(), + }; + + assert_eq!( + apply_admit(&ctrl, &source, &state, 150).await, + AdmissionDecision::Reject + ); + // But a request within the admissible window succeeds. 
+ assert_eq!( + apply_admit(&ctrl, &source, &state, 100).await, + AdmissionDecision::Admit + ); +} + +// ── Property tests ─────────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +enum Op { + Admit(u64), +} + +fn arb_resident_priority() -> impl Strategy<Value = EvictionPriority> { + prop_oneof![Just(EvictionPriority::Idle), Just(EvictionPriority::Warm)] +} + +fn arb_ops() -> impl Strategy<Value = Vec<Op>> { + prop::collection::vec((1u64..800).prop_map(Op::Admit), 0..40) +} + +/// Strategy yielding a `(limit, residents)` start state where the residents fit +/// under the limit by construction, by carving each resident's size out of a +/// remaining budget. A resident set exceeding the limit cannot occur in reality +/// (it would already have been OOM-killed), so it is not a valid start state. +fn arb_fitting_state( + limit_range: std::ops::Range<u64>, + max_residents: usize, +) -> impl Strategy<Value = (u64, Vec<Resident>)> { + limit_range.prop_flat_map(move |limit| { + // Reserve a fraction of the limit for residents (0..=80%) so there is + // usually some free headroom in the start state too. Each resident then + // takes a slice of that budget. + ( + Just(limit), + (0u64..=(limit * 4 / 5)), + prop::collection::vec((1u64..=1000, arb_resident_priority()), 0..max_residents), + ) + .prop_map(|(limit, mut budget, raw)| { + let mut residents = Vec::new(); + for (weight, priority) in raw { + if budget == 0 { + break; + } + // Each resident is at most a third of the remaining budget, + // so several can coexist; clamp to whatever budget is left. + let size = weight.min(budget.div_ceil(3)).max(1).min(budget); + residents.push(Resident { size, priority }); + budget -= size; + } + (limit, residents) + }) + }) +} + +proptest! { + /// Safety invariant: across any random sequence of admits — with random + /// pre-resident work, random sizes, and a random reserve — modeled usage + /// must never exceed the limit. This is the property that rules out OOM.
+ #[test] + fn usage_never_exceeds_limit( + (limit, residents) in arb_fitting_state(500..5000, 20), + reserve in 0u64..300, + ops in arb_ops(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone(), reserve); + let source = FakeEvictionSource { state: state.clone() }; + + for op in ops { + match op { + Op::Admit(req) => { + apply_admit(&ctrl, &source, &state, req).await; + let s = state.lock().unwrap(); + prop_assert!( + s.usage() <= s.limit, + "usage {} exceeded limit {}", s.usage(), s.limit + ); + } + } + } + Ok(()) + }).unwrap(); + } + + /// No spurious eviction: if every admit in the sequence fits within the + /// admissible headroom at the moment it is issued, nothing is ever evicted. + /// We guarantee the precondition by giving a huge limit and small requests. + #[test] + fn no_eviction_when_headroom_ample( + residents in prop::collection::vec( + (1u64..500, arb_resident_priority()) + .prop_map(|(size, priority)| Resident { size, priority }), + 0..20, + ), + ops in prop::collection::vec((1u64..50).prop_map(Op::Admit), 0..30), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit: 1_000_000, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone(), 0); + let source = FakeEvictionSource { state: state.clone() }; + + for op in ops { + match op { + Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; } + } + } + prop_assert_eq!(state.lock().unwrap().evictions, 0); + Ok(()) + }).unwrap(); + } + + /// Eviction ordering: whenever eviction happens, no warm item is evicted + /// while an idle item was still available to evict at that step. 
We check + /// the weaker, order-level invariant that the recorded eviction order never + /// has a warm eviction before an idle one within a single `try_admit` call + /// — i.e. idle is always drained first. + #[test] + fn idle_evicted_before_warm( + (limit, residents) in arb_fitting_state(500..3000, 25), + ops in prop::collection::vec((1u64..1500).prop_map(Op::Admit), 1..20), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone(), 0); + let source = FakeEvictionSource { state: state.clone() }; + + for op in ops { + match op { + Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; } + } + } + + // Once a warm eviction appears in the order, an idle eviction must + // never follow it (idle is always exhausted first). + let order = state.lock().unwrap().eviction_order.clone(); + let mut seen_warm = false; + for p in order { + match p { + EvictionPriority::Warm => seen_warm = true, + EvictionPriority::Idle => prop_assert!( + !seen_warm, + "idle eviction followed a warm eviction" + ), + } + } + Ok(()) + }).unwrap(); + } +} + +// ── Carve-out ratio ────────────────────────────────────────────────────────── + +#[test] +async fn usable_ratio_caps_admission_below_full_limit() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + pinned_usage: 0, + residents: vec![], + ..Default::default() + })); + // ceiling = 0.8 * 1000 = 800. Request 850 must be rejected even though the + // raw limit (1000) would allow it — the top 20% is reserved for the host. 
+ let ctrl = controller_with_ratio(state.clone(), 0.8, 0); + let source = FakeEvictionSource { + state: state.clone(), + }; + + assert_eq!( + apply_admit(&ctrl, &source, &state, 850).await, + AdmissionDecision::Reject + ); + assert_eq!( + apply_admit(&ctrl, &source, &state, 800).await, + AdmissionDecision::Admit + ); +} + +// ── Concurrency: the simultaneous-big-start race ───────────────────────────── + +proptest! { + /// The contract for the safety invariant under concurrency. + /// + /// Many admissions race at once with no external serialisation across the + /// headroom check and the commit (the commit models the upstream atomic + /// permit grant; the check is a separate prior read, so a genuine + /// time-of-check/time-of-use window exists between concurrent tasks). + /// + /// The invariant: real usage must never exceed the true `limit`. Admissions + /// may collectively overshoot the carve-out ceiling into the reserve — that + /// is what the reserve is for — but never past `limit` itself. The reserve + /// is sized here to cover the worst-case concurrent overshoot (number of + /// racers × max request), so a passing test means the reserve margin is a + /// sufficient substitute for serialising the gate. If this ever fails, the + /// margin is insufficient for the chosen concurrency and the gate's + /// correctness depends on stronger synchronisation. + #[test] + fn concurrent_admissions_never_exceed_limit( + racers in 2usize..16, + request in 50u64..400, + ) { + // Worst case: every racer passes the check against the same snapshot and + // commits. The reserve must cover (racers - 1) extra in-flight requests + // beyond the one the headroom was actually sized for. + let reserve = request * racers as u64; + // Ceiling must leave room for at least one request above the reserve. 
+ let limit = reserve + request + 1000; + + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .build() + .unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: 0, + residents: vec![], + ..Default::default() + })); + let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve)); + + let mut handles = Vec::new(); + for _ in 0..racers { + let ctrl = ctrl.clone(); + let state = state.clone(); + handles.push(tokio::spawn(async move { + let source = FakeEvictionSource { state: state.clone() }; + let decision = ctrl.try_admit(request, &source).await; + if decision == AdmissionDecision::Admit { + // Models the atomic permit grant: a single locked + // fetch-add, separate from the (already-completed) check. + state.lock().unwrap().pinned_usage += request; + } + })); + } + for h in handles { + h.await.unwrap(); + } + + let s = state.lock().unwrap(); + prop_assert!( + s.usage() <= s.limit, + "concurrent admissions drove usage {} past limit {}", + s.usage(), s.limit + ); + Ok(()) + }).unwrap(); + } +} diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs new file mode 100644 index 0000000000..0d1c4088a3 --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs @@ -0,0 +1,180 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
Platform-abstracted probe of the executor's real memory usage and limit. +//! +//! Reports the measured resident memory and hard limit of the process's +//! environment, used as the authoritative input to admission decisions (in +//! contrast to the estimate-based semaphore in [`super::ActiveWorkers`]). +//! +//! The trait is abstract over where the limit comes from: a containerised Linux +//! deployment reads it from the cgroup, an unconstrained process reads host RAM, +//! a configured override pins it explicitly. Backend fidelity is asymmetric — +//! cgroup v2 gives the exact kernel-enforced number; other targets fall back to +//! best-effort process RSS via [`ProcessRssProbe`] until dedicated macOS and +//! Windows backends land. + +use std::fmt::Debug; + +/// A snapshot of the executor environment's memory state. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct MemorySnapshot { + /// Hard ceiling: cgroup `memory.max` on constrained Linux, configured cap + /// or host RAM otherwise. Reaching this with `current` triggers an + /// OOM-kill. + pub limit_bytes: u64, + /// Currently-resident bytes: cgroup `memory.current` on Linux (touched + /// pages, lagging but exact), process RSS otherwise. + pub current_bytes: u64, +} + +impl MemorySnapshot { + /// Bytes between current usage and the hard limit. Saturating: never + /// underflows if `current` momentarily exceeds the reported `limit`. + pub fn headroom_bytes(&self) -> u64 { + self.limit_bytes.saturating_sub(self.current_bytes) + } +} + +/// Reads the executor environment's real memory state. Cheap enough to sample +/// at admission time, but not on every wasmtime `memory.grow` (that is what the +/// estimate-semaphore pre-check absorbs). 
+pub trait MemoryProbe: Send + Sync + Debug { + fn snapshot(&self) -> MemorySnapshot; + + fn limit_bytes(&self) -> u64 { + self.snapshot().limit_bytes + } + + fn current_bytes(&self) -> u64 { + self.snapshot().current_bytes + } + + fn headroom_bytes(&self) -> u64 { + self.snapshot().headroom_bytes() + } +} + +/// A probe whose limit is fixed at construction and whose current usage comes +/// from cross-platform process RSS via `sysinfo`. +/// +/// This is the best-effort fallback used wherever no higher-fidelity backend +/// is available yet (notably macOS and Windows). It is also used when a +/// `system_memory_override` pins the limit explicitly. +#[derive(Debug)] +pub struct ProcessRssProbe { + limit_bytes: u64, +} + +impl ProcessRssProbe { + pub fn new(limit_bytes: u64) -> Self { + Self { limit_bytes } + } + + fn current_rss() -> u64 { + let mut sysinfo = sysinfo::System::new(); + let pid = sysinfo::Pid::from_u32(std::process::id()); + sysinfo.refresh_processes(sysinfo::ProcessesToUpdate::Some(&[pid]), true); + sysinfo.process(pid).map(|p| p.memory()).unwrap_or_default() + } +} + +impl MemoryProbe for ProcessRssProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit_bytes, + current_bytes: Self::current_rss(), + } + } +} + +/// Linux cgroup v2 probe. Reads `memory.max` and `memory.current` from the +/// process's cgroup. +#[cfg(target_os = "linux")] +#[derive(Debug)] +pub struct CgroupV2Probe { + /// Resolved path to the cgroup directory, e.g. `/sys/fs/cgroup`. + base: std::path::PathBuf, + /// Fallback limit used when `memory.max` reads `max` (unlimited) — usually + /// host RAM or the configured override. + fallback_limit_bytes: u64, +} + +#[cfg(target_os = "linux")] +impl CgroupV2Probe { + const DEFAULT_BASE: &'static str = "/sys/fs/cgroup"; + + /// Attempts to construct a cgroup v2 probe. 
Returns `None` when the host is + /// not running cgroup v2 (no unified `memory.current` at the base path), so + /// the caller can fall back to [`ProcessRssProbe`]. + pub fn try_new(fallback_limit_bytes: u64) -> Option { + let base = std::path::PathBuf::from(Self::DEFAULT_BASE); + // cgroup v2 unified hierarchy exposes memory.current directly at the + // delegated cgroup path. If it is not readable we are not on v2. + if std::fs::read_to_string(base.join("memory.current")).is_ok() { + Some(Self { + base, + fallback_limit_bytes, + }) + } else { + None + } + } + + fn read_u64(&self, file: &str) -> Option { + let raw = std::fs::read_to_string(self.base.join(file)).ok()?; + raw.trim().parse::().ok() + } + + fn read_limit(&self) -> u64 { + // memory.max contains either a number of bytes or the literal "max". + match std::fs::read_to_string(self.base.join("memory.max")) { + Ok(raw) => { + let trimmed = raw.trim(); + if trimmed == "max" { + self.fallback_limit_bytes + } else { + trimmed.parse::().unwrap_or(self.fallback_limit_bytes) + } + } + Err(_) => self.fallback_limit_bytes, + } + } +} + +#[cfg(target_os = "linux")] +impl MemoryProbe for CgroupV2Probe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.read_limit(), + current_bytes: self.read_u64("memory.current").unwrap_or(0), + } + } +} + +/// Constructs the best available probe for the current platform. +/// +/// On Linux, prefers cgroup v2; falls back to process RSS. On other targets, +/// uses process RSS until dedicated backends land. `limit_bytes` is the limit +/// to charge against and is also the fallback when the cgroup reports an +/// unlimited `memory.max`. 
+pub fn default_probe(limit_bytes: u64) -> Box { + #[cfg(target_os = "linux")] + { + if let Some(probe) = CgroupV2Probe::try_new(limit_bytes) { + return Box::new(probe); + } + } + Box::new(ProcessRssProbe::new(limit_bytes)) +} diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 3a9ece958b..00a3fc6f4d 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod admission; pub mod concurrent_agents_scheduler; pub mod concurrent_agents_semaphore; pub mod fs_semaphore; +pub mod memory_probe; #[cfg(test)] mod tests; @@ -26,6 +28,9 @@ pub use fs_semaphore::{ filesystem_storage_permits_to_bytes, filesystem_storage_pool_bytes_to_permits, }; +use admission::{AdmissionController, AdmissionDecision, EvictionPriority, EvictionSource}; +use async_trait::async_trait; +use memory_probe::default_probe; use std::sync::Arc; use std::time::Duration; use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore, TryAcquireError}; @@ -70,6 +75,11 @@ pub struct ActiveWorkers { concurrent_agents: Arc, priority_allocation_lock: Arc>, acquire_retry_delay: Duration, + /// Authoritative measured-headroom admission gate. Decides whether real + /// memory headroom permits a new acquisition, evicting via the worker set + /// when short. The estimate-based `worker_memory` semaphore is the cheap + /// pre-filter and atomic commit in front of it. 
+ admission: AdmissionController, } #[derive(Debug)] @@ -110,6 +120,10 @@ impl Drop for WorkerMemoryPermit { impl ActiveWorkers { pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self { let worker_memory_size = memory_config.worker_memory(); + let admission = AdmissionController::new( + default_probe(memory_config.total_system_memory()), + memory_config.admission_policy(), + ); let active_workers = Self { workers: Cache::new( None, @@ -125,6 +139,7 @@ impl ActiveWorkers { concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()), acquire_retry_delay: memory_config.acquire_retry_delay, priority_allocation_lock: Arc::new(Mutex::new(())), + admission, }; active_workers.initialize_metrics(worker_memory_size); active_workers @@ -208,6 +223,20 @@ impl ActiveWorkers { .expect("requested memory size is too large"); loop { + // Authoritative measured-headroom gate. Evicts idle-then-warm when + // real headroom is short; rejects (and we back off) when it cannot + // make room rather than risking the limit. + if self + .admission + .try_admit(memory, &self.eviction_source()) + .await + == AdmissionDecision::Reject + { + debug!("Measured headroom insufficient for {mem32}, backing off and retrying"); + tokio::time::sleep(self.acquire_retry_delay).await; + continue; + } + let available = self.worker_memory.available_permits(); let lock = self.priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once let result = self.worker_memory.clone().try_acquire_many_owned(mem32); @@ -249,10 +278,32 @@ impl ActiveWorkers { } } + /// Builds an [`EvictionSource`] view over the live worker set for the + /// admission controller to reclaim memory through. 
+ fn eviction_source(&self) -> WorkerEvictionSource { + WorkerEvictionSource { + workers: self.workers.clone(), + } + } + pub async fn try_acquire(&self, memory: u64) -> Option { let mem32: u32 = memory .try_into() .expect("requested memory size is too large"); + + // Authoritative measured-headroom gate. Single attempt (this is the + // non-blocking path): if real headroom is insufficient even after + // eviction, do not admit. + if self + .admission + .try_admit(memory, &self.eviction_source()) + .await + == AdmissionDecision::Reject + { + debug!("Measured headroom insufficient for {mem32}, not admitting"); + return None; + } + let mut lock = None; loop { match self.worker_memory.clone().try_acquire_many_owned(mem32) { @@ -289,73 +340,23 @@ impl ActiveWorkers { let current_avail = self.worker_memory.available_permits(); let needed = memory.saturating_sub(current_avail as u64); - if needed > 0 { - let mut idle_candidates = Vec::new(); - let mut warm_candidates = Vec::new(); - - debug!("Collecting memory eviction candidates"); - let pairs = self.workers.iter().await; - for (agent_id, worker) in pairs { - if let Some(class) = worker.eviction_class().await - && let Ok(mem) = worker.memory_requirement().await - { - let last_changed = worker.last_execution_state_change(); - let entry = (agent_id, worker, mem, last_changed); - match class { - crate::worker::EvictionClass::LoadedIdle => { - idle_candidates.push(entry); - } - crate::worker::EvictionClass::WarmRunnable => { - warm_candidates.push(entry); - } - } - } - } - - // Sort each bucket by timestamp — newest first so we pop oldest - idle_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis()); - idle_candidates.reverse(); - warm_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis()); - warm_candidates.reverse(); - - let mut freed = 0u64; - - // First evict LoadedIdle workers (cheapest) - while freed < needed && !idle_candidates.is_empty() { - let (agent_id, worker, mem, _) = idle_candidates.pop().unwrap(); - 
debug!("Trying to stop idle {agent_id} to free up memory"); - if worker - .stop_if_evictable(crate::worker::EvictionClass::LoadedIdle) - .await - { - debug!("Stopped idle {agent_id} to free up {mem} memory"); - crate::metrics::workers::record_worker_eviction("LoadedIdle"); - freed += mem; - } - } + if needed == 0 { + debug!("Memory was freed up in the meantime"); + return true; + } - // Then evict WarmRunnable workers if still under pressure - while freed < needed && !warm_candidates.is_empty() { - let (agent_id, worker, mem, _) = warm_candidates.pop().unwrap(); - debug!("Trying to stop warm-runnable {agent_id} to free up memory"); - if worker - .stop_if_evictable(crate::worker::EvictionClass::WarmRunnable) - .await - { - debug!("Stopped warm-runnable {agent_id} to free up {mem} memory"); - crate::metrics::workers::record_worker_eviction("WarmRunnable"); - freed += mem; - } + let mut freed = 0u64; + for priority in [EvictionPriority::Idle, EvictionPriority::Warm] { + if freed >= needed { + break; } + freed += evict_at_most_memory(&self.workers, priority, needed - freed).await; + } - if freed > 0 { - debug!("Freed up {freed}"); - } - freed >= needed - } else { - debug!("Memory was freed up in the meantime"); - true + if freed > 0 { + debug!("Freed up {freed}"); } + freed >= needed } /// Blocking acquire of storage semaphore permits. Loops until the requested @@ -479,3 +480,66 @@ impl ActiveWorkers { crate::metrics::storage::record_worker_memory_pool_total(worker_memory_size as u64); } } + +impl From for crate::worker::EvictionClass { + fn from(priority: EvictionPriority) -> Self { + match priority { + EvictionPriority::Idle => crate::worker::EvictionClass::LoadedIdle, + EvictionPriority::Warm => crate::worker::EvictionClass::WarmRunnable, + } + } +} + +/// Evicts resident workers at a single priority tier, oldest-first, stopping +/// once at least `needed_bytes` have been freed or the tier is exhausted. +/// Returns the bytes actually reclaimed. 
+async fn evict_at_most_memory( + workers: &Cache>, WorkerExecutorError>, + priority: EvictionPriority, + needed_bytes: u64, +) -> u64 { + let target_class: crate::worker::EvictionClass = priority.into(); + + let mut candidates = Vec::new(); + for (agent_id, worker) in workers.iter().await { + if let Some(class) = worker.eviction_class().await + && class == target_class + && let Ok(mem) = worker.memory_requirement().await + { + let last_changed = worker.last_execution_state_change(); + candidates.push((agent_id, worker, mem, last_changed)); + } + } + + // Sort by timestamp newest-first so we pop the oldest first. + candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis()); + candidates.reverse(); + + let mut freed = 0u64; + while freed < needed_bytes && !candidates.is_empty() { + let (agent_id, worker, mem, _) = candidates.pop().unwrap(); + debug!("Trying to stop {target_class:?} {agent_id} to free up memory"); + if worker.stop_if_evictable(target_class).await { + debug!("Stopped {target_class:?} {agent_id} to free up {mem} memory"); + crate::metrics::workers::record_worker_eviction(match priority { + EvictionPriority::Idle => "LoadedIdle", + EvictionPriority::Warm => "WarmRunnable", + }); + freed += mem; + } + } + freed +} + +/// Adapts the live worker set to the [`EvictionSource`] the admission controller +/// drives. Holds a cheap clone of the worker cache handle. 
+struct WorkerEvictionSource { + workers: Cache>, WorkerExecutorError>, +} + +#[async_trait] +impl EvictionSource for WorkerEvictionSource { + async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 { + evict_at_most_memory(&self.workers, priority, needed_bytes).await + } +} diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 76b7720bf0..29ea514cb6 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -969,6 +969,10 @@ pub struct MemoryConfig { /// so this term over-accounts per-worker memory for large components. /// Lower this (e.g. to 0.0) to size permits primarily off linear memory. pub component_size_coefficient: f64, + /// Bytes of measured headroom kept free below the usable ceiling as a margin + /// against concurrent admissions overshooting before becoming resident. Used + /// by the measured-headroom admission gate. + pub admission_reserve_bytes: u64, #[serde(with = "humantime_serde")] pub acquire_retry_delay: Duration, pub oom_retry_config: RetryConfig, @@ -992,6 +996,17 @@ impl MemoryConfig { pub fn worker_memory(&self) -> usize { (self.total_system_memory() as f64 * self.worker_memory_ratio) as usize } + + /// The admission policy for the measured-headroom gate. Reuses + /// `worker_memory_ratio` as the usable fraction of the measured limit (the + /// host keeps the remainder) and `admission_reserve_bytes` as the concurrent + /// overshoot margin. 
+ pub fn admission_policy(&self) -> crate::services::active_workers::admission::AdmissionPolicy { + crate::services::active_workers::admission::AdmissionPolicy { + usable_ratio: self.worker_memory_ratio, + reserve_bytes: self.admission_reserve_bytes, + } + } } impl SafeDisplay for MemoryConfig { @@ -1015,6 +1030,11 @@ impl SafeDisplay for MemoryConfig { "component size coefficient: {}", self.component_size_coefficient ); + let _ = writeln!( + &mut result, + "admission reserve bytes: {}", + self.admission_reserve_bytes + ); let _ = writeln!( &mut result, "acquire retry delay: {:?}", @@ -1540,6 +1560,7 @@ impl Default for MemoryConfig { worker_memory_ratio: 0.8, worker_estimate_coefficient: 1.1, component_size_coefficient: 2.0, + admission_reserve_bytes: 256 * 1024 * 1024, acquire_retry_delay: Duration::from_millis(500), oom_retry_config: RetryConfig { max_attempts: u32::MAX, From 817c6726821dfb9195a6e54424d0acc455819d57 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Mon, 8 Jun 2026 21:30:57 -0700 Subject: [PATCH 21/60] feat(worker-executor): charge component module size once per resident component --- .../active_workers/component_charge/mod.rs | 171 +++++++++++++ .../active_workers/component_charge/tests.rs | 206 ++++++++++++++++ .../src/services/active_workers/mod.rs | 230 ++++++++++++------ .../src/services/golem_config.rs | 7 +- golem-worker-executor/src/worker/mod.rs | 64 ++++- 5 files changed, 593 insertions(+), 85 deletions(-) create mode 100644 golem-worker-executor/src/services/active_workers/component_charge/mod.rs create mode 100644 golem-worker-executor/src/services/active_workers/component_charge/tests.rs diff --git a/golem-worker-executor/src/services/active_workers/component_charge/mod.rs b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs new file mode 100644 index 0000000000..8ddd4aa8aa --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs @@ 
-0,0 +1,171 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Per-component memory charge for the shared compiled module. +//! +//! A component's compiled module is loaded into the wasmtime engine once and +//! shared by every worker of that component, so its size must be charged to the +//! memory pool once per resident component rather than once per worker. This +//! registry tracks how many workers of each component are resident and holds a +//! single module-sized charge for as long as at least one is. +//! +//! The charge is represented by an opaque guard obtained from a [`ChargeSource`] +//! (the worker memory pool in production). The first resident worker of a +//! component acquires the charge; the last to unload drops it. The registry is +//! decoupled from the pool via [`ChargeSource`] so the refcounting can be +//! property-tested in isolation. + +use async_trait::async_trait; +use std::collections::HashMap; +use std::fmt::Debug; +use std::hash::Hash; +use std::sync::{Arc, Mutex}; + +/// Acquires an opaque, RAII charge of a given byte size from some pool. The +/// returned value releases the charge when dropped. +#[async_trait] +pub trait ChargeSource: Send + Sync { + type Charge: Send + Sync + 'static; + + async fn acquire_charge(&self, bytes: u64) -> Self::Charge; +} + +/// Tracks resident-worker refcounts per component key and holds one module-sized +/// charge per component while any worker of it is resident. 
+pub struct ComponentChargeRegistry { + source: S, + state: Mutex>>, +} + +struct Entry { + refcount: usize, + /// The held module charge. Always `Some` while `refcount > 0`. + charge: Option>, +} + +/// Handle representing one worker's residency of a component. While at least one +/// `ComponentChargeGuard` for a key is alive, the registry holds that +/// component's module charge. Dropping the last guard releases it. +pub struct ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + 'static, +{ + registry: Arc>, + key: K, +} + +impl Debug for ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + 'static, + S: ChargeSource, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ComponentChargeGuard").finish() + } +} + +/// Type-erased held component charge. A worker holds one of these for as long as +/// it is resident; dropping it releases the worker's residency of its component. +/// Erasing the source/key types lets non-generic holders store the guard. +pub trait HeldComponentCharge: Send + Sync + Debug {} + +impl HeldComponentCharge for ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + Sync + 'static, + S: ChargeSource + 'static, + S::Charge: Sync, +{ +} + +impl ComponentChargeRegistry +where + K: Eq + Hash + Clone + Send + 'static, + S: ChargeSource, +{ + pub fn new(source: S) -> Arc { + Arc::new(Self { + source, + state: Mutex::new(HashMap::new()), + }) + } + + /// Register one resident worker of `key` (whose module is `charge_bytes`). + /// Acquires the module charge if this is the first resident worker of the + /// component. The returned guard releases residency on drop. + pub async fn acquire( + self: &Arc, + key: K, + charge_bytes: u64, + ) -> ComponentChargeGuard { + // Decide under the lock whether this caller is the one that must acquire + // the (possibly blocking) charge, so only the first resident worker of a + // component does so. Acquire the charge outside the lock, then publish it. 
+ let must_acquire = { + let mut state = self.state.lock().unwrap(); + let entry = state.entry(key.clone()).or_insert(Entry { + refcount: 0, + charge: None, + }); + entry.refcount += 1; + entry.refcount == 1 + }; + + if must_acquire { + let charge = Arc::new(self.source.acquire_charge(charge_bytes).await); + let mut state = self.state.lock().unwrap(); + if let Some(entry) = state.get_mut(&key) { + // Only publish if still resident (refcount could have churned). + if entry.refcount > 0 && entry.charge.is_none() { + entry.charge = Some(charge); + } + } + } + + ComponentChargeGuard { + registry: self.clone(), + key, + } + } + + fn release(&self, key: &K) { + let mut state = self.state.lock().unwrap(); + if let Some(entry) = state.get_mut(key) { + entry.refcount = entry.refcount.saturating_sub(1); + if entry.refcount == 0 { + // Drop the held charge (returns it to the pool) and forget the + // component entirely. + state.remove(key); + } + } + } +} + +impl Drop for ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + 'static, + S: ChargeSource, +{ + fn drop(&mut self) { + self.registry.release(&self.key); + } +} + +impl Debug for ComponentChargeRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ComponentChargeRegistry").finish() + } +} + +#[cfg(test)] +mod tests; diff --git a/golem-worker-executor/src/services/active_workers/component_charge/tests.rs b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs new file mode 100644 index 0000000000..c58f1ab937 --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs @@ -0,0 +1,206 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Tests for the per-component module charge registry. +//! +//! A [`FakeChargeSource`] models a pool by tracking total charged bytes in an +//! atomic; each charge it hands out decrements that total when dropped. The +//! tests then assert the registry's contract: a component's module is charged +//! exactly once while any worker of it is resident, released when the last +//! unloads, and never leaked or double-charged under concurrent churn. + +use super::*; +use proptest::prelude::*; +use std::sync::atomic::{AtomicU64, Ordering}; +use test_r::test; + +test_r::enable!(); + +/// A charge that returns `bytes` to the shared counter when dropped. +struct FakeCharge { + bytes: u64, + charged_total: Arc, +} + +impl Drop for FakeCharge { + fn drop(&mut self) { + self.charged_total.fetch_sub(self.bytes, Ordering::SeqCst); + } +} + +#[derive(Clone)] +struct FakeChargeSource { + charged_total: Arc, + /// Number of times a charge was actually acquired, to detect double-charge. 
+ acquire_count: Arc, +} + +impl FakeChargeSource { + fn new() -> Self { + Self { + charged_total: Arc::new(AtomicU64::new(0)), + acquire_count: Arc::new(AtomicU64::new(0)), + } + } +} + +#[async_trait::async_trait] +impl ChargeSource for FakeChargeSource { + type Charge = FakeCharge; + + async fn acquire_charge(&self, bytes: u64) -> FakeCharge { + self.acquire_count.fetch_add(1, Ordering::SeqCst); + self.charged_total.fetch_add(bytes, Ordering::SeqCst); + FakeCharge { + bytes, + charged_total: self.charged_total.clone(), + } + } +} + +const MODULE_BYTES: u64 = 17 * 1024 * 1024; + +// ── Single-case unit tests ─────────────────────────────────────────────────── + +#[test] +async fn first_worker_charges_once_last_releases() { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let count = source.acquire_count.clone(); + let registry = ComponentChargeRegistry::new(source); + + let g1 = registry.acquire("comp-a", MODULE_BYTES).await; + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + assert_eq!(count.load(Ordering::SeqCst), 1); + + // Second worker of the same component: no additional charge. + let g2 = registry.acquire("comp-a", MODULE_BYTES).await; + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + assert_eq!(count.load(Ordering::SeqCst), 1); + + // Dropping one of two keeps the charge. + drop(g1); + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + + // Dropping the last releases it. + drop(g2); + assert_eq!(charged.load(Ordering::SeqCst), 0); +} + +#[test] +async fn distinct_components_each_charge_once() { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let registry = ComponentChargeRegistry::new(source); + + let _a = registry.acquire("comp-a", MODULE_BYTES).await; + let _b = registry.acquire("comp-b", MODULE_BYTES).await; + let _b2 = registry.acquire("comp-b", MODULE_BYTES).await; + + // Two distinct components → charged twice, regardless of worker count. 
+ assert_eq!(charged.load(Ordering::SeqCst), 2 * MODULE_BYTES); +} + +#[test] +async fn re_acquiring_after_full_release_charges_again() { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let count = source.acquire_count.clone(); + let registry = ComponentChargeRegistry::new(source); + + drop(registry.acquire("comp-a", MODULE_BYTES).await); + assert_eq!(charged.load(Ordering::SeqCst), 0); + + // A fresh residency after full release acquires the charge again. + let _g = registry.acquire("comp-a", MODULE_BYTES).await; + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + assert_eq!(count.load(Ordering::SeqCst), 2); +} + +// ── Property tests ─────────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +enum Op { + /// Acquire a guard for component index `usize`. + Acquire(usize), + /// Drop the n-th currently-held guard (modulo number held). + Drop(usize), +} + +fn arb_ops(num_components: usize) -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + (0..num_components).prop_map(Op::Acquire), + (0usize..100).prop_map(Op::Drop), + ], + 0..80, + ) +} + +proptest! { + /// The charged total always equals the sum of `MODULE_BYTES` over the distinct + /// components that currently have at least one held guard. This is the core + /// "once per resident component" contract: never per-worker, never leaked, + /// never double-charged. + #[test] + fn charge_tracks_distinct_resident_components( + num_components in 1usize..6, + ops in arb_ops(6), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let registry = ComponentChargeRegistry::new(source); + + // Held guards keyed by component index. 
+ let mut held: Vec<(usize, ComponentChargeGuard<&'static str, FakeChargeSource>)> = + Vec::new(); + let keys: Vec<&'static str> = + ["c0", "c1", "c2", "c3", "c4", "c5"][..num_components].to_vec(); + + for op in ops { + match op { + Op::Acquire(i) => { + let i = i % num_components; + let guard = registry.acquire(keys[i], MODULE_BYTES).await; + held.push((i, guard)); + } + Op::Drop(n) => { + if !held.is_empty() { + let idx = n % held.len(); + held.remove(idx); + } + } + } + + // Distinct resident component count == charged_total / MODULE_BYTES. + let mut distinct: Vec = held.iter().map(|(i, _)| *i).collect(); + distinct.sort_unstable(); + distinct.dedup(); + let expected = distinct.len() as u64 * MODULE_BYTES; + prop_assert_eq!( + charged.load(Ordering::SeqCst), + expected, + "charged total did not match distinct resident components" + ); + } + + // After dropping everything, nothing remains charged. + drop(held); + prop_assert_eq!(charged.load(Ordering::SeqCst), 0); + Ok(()) + }).unwrap(); + } +} diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 00a3fc6f4d..34f8b190ec 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -13,6 +13,7 @@ // limitations under the License. 
pub mod admission; +pub mod component_charge; pub mod concurrent_agents_scheduler; pub mod concurrent_agents_semaphore; pub mod fs_semaphore; @@ -30,6 +31,8 @@ pub use fs_semaphore::{ use admission::{AdmissionController, AdmissionDecision, EvictionPriority, EvictionSource}; use async_trait::async_trait; +pub use component_charge::HeldComponentCharge; +use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry}; use memory_probe::default_probe; use std::sync::Arc; use std::time::Duration; @@ -45,7 +48,7 @@ use crate::workerctx::WorkerCtx; use golem_common::cache::{BackgroundEvictionMode, Cache, FullCacheEvictionMode, SimpleCache}; use golem_common::model::account::AccountId; use golem_common::model::agent::Principal; -use golem_common::model::component::ComponentRevision; +use golem_common::model::component::{ComponentId, ComponentRevision}; use golem_common::model::environment::EnvironmentId; use golem_common::model::invocation_context::InvocationContextStack; use golem_common::model::worker::AgentConfigEntryDto; @@ -80,8 +83,22 @@ pub struct ActiveWorkers { /// when short. The estimate-based `worker_memory` semaphore is the cheap /// pre-filter and atomic commit in front of it. admission: AdmissionController, + /// Charges each resident component's compiled module size to the estimate + /// pool exactly once (shared across all its workers) rather than per worker. + component_charges: + Arc>>, + /// Multiplier applied to a component's `component_size` when sizing its + /// module charge permit. + component_size_coefficient: f64, } +/// Identifies a compiled component for module-charge accounting. +type ComponentChargeKey = (ComponentId, ComponentRevision); + +/// Guard held by a resident worker keeping its component's module charge alive. 
+pub type WorkerComponentCharge = + ComponentChargeGuard>; + #[derive(Debug)] pub struct WorkerMemoryPermit { permit: Option, @@ -124,27 +141,56 @@ impl ActiveWorkers { default_probe(memory_config.total_system_memory()), memory_config.admission_policy(), ); + let workers = Cache::new( + None, + FullCacheEvictionMode::None, + BackgroundEvictionMode::None, + "active_workers", + ); + let worker_memory = Arc::new(Semaphore::new(worker_memory_size)); + let priority_allocation_lock = Arc::new(Mutex::new(())); + let component_charges = ComponentChargeRegistry::new(MemoryPoolChargeSource { + worker_memory: worker_memory.clone(), + workers: workers.clone(), + priority_allocation_lock: priority_allocation_lock.clone(), + acquire_retry_delay: memory_config.acquire_retry_delay, + }); let active_workers = Self { - workers: Cache::new( - None, - FullCacheEvictionMode::None, - BackgroundEvictionMode::None, - "active_workers", - ), - worker_memory: Arc::new(Semaphore::new(worker_memory_size)), + workers, + worker_memory, worker_filesystem_storage: Arc::new(FilesystemStorageSemaphore::new( storage_config.worker_filesystem_storage(), storage_config.acquire_retry_delay, )), concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()), acquire_retry_delay: memory_config.acquire_retry_delay, - priority_allocation_lock: Arc::new(Mutex::new(())), + priority_allocation_lock, admission, + component_charges, + component_size_coefficient: memory_config.component_size_coefficient, }; active_workers.initialize_metrics(worker_memory_size); active_workers } + /// Acquire (or share) the per-component module charge for a worker of the + /// given component. The first resident worker of the component pays its + /// compiled-module size (scaled by `component_size_coefficient`) into the + /// estimate pool; subsequent workers share the same charge. The returned + /// guard releases residency on drop, and the charge is freed when the last + /// worker of the component unloads. 
+ pub async fn acquire_component_charge( + &self, + component_id: ComponentId, + component_revision: ComponentRevision, + component_module_bytes: u64, + ) -> WorkerComponentCharge { + let charge_bytes = (self.component_size_coefficient * component_module_bytes as f64) as u64; + self.component_charges + .acquire((component_id, component_revision), charge_bytes) + .await + } + pub async fn get_or_add( &self, deps: &T, @@ -237,44 +283,21 @@ impl ActiveWorkers { continue; } - let available = self.worker_memory.available_permits(); - let lock = self.priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once - let result = self.worker_memory.clone().try_acquire_many_owned(mem32); - drop(lock); - match result { - Ok(permit) => { - debug!( - "Acquired {} memory of {}, new available: {}, permit size: {}", - mem32, - available, - self.worker_memory.available_permits(), - permit.num_permits() - ); - break WorkerMemoryPermit::new(permit); - } - Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"), - Err(TryAcquireError::NoPermits) => { - debug!( - "Not enough memory to allocate {mem32} (available: {}), trying to free some up", - self.worker_memory.available_permits() - ); - if self.try_free_up_memory(memory).await { - debug!("Freed up some memory, retrying"); - // We have enough memory unless another worker has taken it in the meantime, - // so retry the loop - continue; - } else { - debug!( - "Could not free up memory, retrying asking for permits after some time" - ); - // Could not free up enough memory, so waiting for permits to be available. - // We cannot use acquire_many() to wait for the permits because it eagerly preallocates - // the available permits, and by that causing deadlocks. So we sleep and retry. - - tokio::time::sleep(self.acquire_retry_delay).await; - } - } + // Estimate-semaphore pool: cheap pre-check + atomic commit. 
+ if let Some(permit) = acquire_pool_permit( + &self.worker_memory, + &self.workers, + &self.priority_allocation_lock, + self.acquire_retry_delay, + mem32, + memory, + ) + .await + { + break permit; } + // No permit on this attempt (eviction was already tried); loop and + // re-run the gate before trying again. } } @@ -336,29 +359,6 @@ impl ActiveWorkers { } } - async fn try_free_up_memory(&self, memory: u64) -> bool { - let current_avail = self.worker_memory.available_permits(); - let needed = memory.saturating_sub(current_avail as u64); - - if needed == 0 { - debug!("Memory was freed up in the meantime"); - return true; - } - - let mut freed = 0u64; - for priority in [EvictionPriority::Idle, EvictionPriority::Warm] { - if freed >= needed { - break; - } - freed += evict_at_most_memory(&self.workers, priority, needed - freed).await; - } - - if freed > 0 { - debug!("Freed up {freed}"); - } - freed >= needed - } - /// Blocking acquire of storage semaphore permits. Loops until the requested /// number of bytes is available, evicting idle workers as needed. pub async fn acquire_filesystem_storage(&self, storage_bytes: u64) -> FilesystemStoragePermit { @@ -531,8 +531,62 @@ async fn evict_at_most_memory( freed } -/// Adapts the live worker set to the [`EvictionSource`] the admission controller -/// drives. Holds a cheap clone of the worker cache handle. +/// Frees up to `memory` estimate-permit bytes by evicting idle-then-warm +/// workers, accounting for permits already available. Returns true when enough +/// is (or was already) free.
+async fn try_free_up_pool_memory( + worker_memory: &Semaphore, + workers: &Cache>, WorkerExecutorError>, + memory: u64, +) -> bool { + let current_avail = worker_memory.available_permits(); + let needed = memory.saturating_sub(current_avail as u64); + if needed == 0 { + return true; + } + + let mut freed = 0u64; + for priority in [EvictionPriority::Idle, EvictionPriority::Warm] { + if freed >= needed { + break; + } + freed += evict_at_most_memory(workers, priority, needed - freed).await; + } + freed >= needed +} + +/// Single estimate-semaphore acquisition attempt with eviction. Returns the +/// permit on success. When no permits are available it evicts idle/warm workers +/// and returns `None` either way (after sleeping `acquire_retry_delay` if eviction +/// fell short), leaving the retry to the caller. Shared by `ActiveWorkers::acquire` +/// and the per-component charge source so there is one pool-acquire implementation. +async fn acquire_pool_permit( + worker_memory: &Arc, + workers: &Cache>, WorkerExecutorError>, + priority_allocation_lock: &Mutex<()>, + acquire_retry_delay: Duration, + mem32: u32, + memory: u64, +) -> Option { + let lock = priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once + let result = worker_memory.clone().try_acquire_many_owned(mem32); + drop(lock); + match result { + Ok(permit) => Some(WorkerMemoryPermit::new(permit)), + Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"), + Err(TryAcquireError::NoPermits) => { + if try_free_up_pool_memory(worker_memory, workers, memory).await { + // Freed enough; signal the caller to retry the acquire. + None + } else { + // Could not free enough; wait before the caller retries.
+ tokio::time::sleep(acquire_retry_delay).await; + None + } + } + } +} + struct WorkerEvictionSource { workers: Cache>, WorkerExecutorError>, } @@ -543,3 +597,37 @@ impl EvictionSource for WorkerEvictionSource { evict_at_most_memory(&self.workers, priority, needed_bytes).await } } + +/// Production [`ChargeSource`] for the per-component module charge. Takes +/// estimate-semaphore permits via the same pool acquire+evict path as worker +/// memory (the measured-headroom gate already accounts for the resident module +/// via real RSS, so the charge does not pass through it). +pub struct MemoryPoolChargeSource { + worker_memory: Arc, + workers: Cache>, WorkerExecutorError>, + priority_allocation_lock: Arc>, + acquire_retry_delay: Duration, +} + +#[async_trait] +impl ChargeSource for MemoryPoolChargeSource { + type Charge = WorkerMemoryPermit; + + async fn acquire_charge(&self, bytes: u64) -> WorkerMemoryPermit { + let mem32: u32 = bytes.try_into().expect("component charge size too large"); + loop { + if let Some(permit) = acquire_pool_permit( + &self.worker_memory, + &self.workers, + &self.priority_allocation_lock, + self.acquire_retry_delay, + mem32, + bytes, + ) + .await + { + break permit; + } + } + } +} diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 29ea514cb6..fdac19ed9c 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -963,11 +963,8 @@ pub struct MemoryConfig { pub system_memory_override: Option, pub worker_memory_ratio: f64, pub worker_estimate_coefficient: f64, - /// Multiplier applied to a worker's `component_size` when estimating its - /// memory permit requirement. The compiled component is loaded into the - /// engine once per component (shared across all workers of that component), - /// so this term over-accounts per-worker memory for large components. - /// Lower this (e.g. 
to 0.0) to size permits primarily off linear memory. + /// Multiplier applied to a component's `component_size`, charged once per + /// resident component (shared across all its workers) rather than per worker. pub component_size_coefficient: f64, /// Bytes of measured headroom kept free below the usable ceiling as a margin /// against concurrent admissions overshooting before becoming resident. Used diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index a65d6dd867..e9e8dbed8f 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -27,7 +27,8 @@ use crate::durable_host::recover_stderr_logs; use crate::metrics::storage::record_filesystem_pool_released; use crate::model::{AgentConfig, ExecutionStatus, LookupResult, ReadFileResult, TrapType}; use crate::services::active_workers::{ - FilesystemStoragePermit, RegisteredConcurrentAccount, WorkerMemoryPermit, + FilesystemStoragePermit, HeldComponentCharge, RegisteredConcurrentAccount, + WorkerComponentCharge, WorkerMemoryPermit, }; use crate::services::events::{Event, EventsSubscription}; use crate::services::golem_config::SnapshotPolicy; @@ -58,6 +59,7 @@ use golem_common::model::agent::{ AgentMode, ParsedAgentId, Principal, Snapshotting, SnapshottingConfig, }; use golem_common::model::component::CanonicalFilePath; +use golem_common::model::component::ComponentId; use golem_common::model::component::ComponentRevision; use golem_common::model::invocation_context::InvocationContextStack; use golem_common::model::oplog::{OplogEntry, OplogIndex, UpdateDescription}; @@ -122,7 +124,6 @@ pub struct Worker { execution_status: Arc>, update_state_lock: Mutex<()>, worker_estimate_coefficient: f64, - component_size_coefficient: f64, // IMPORTANT: Every external operation must acquire the instance lock, even briefly, to confirm the worker isn’t deleting. 
instance: Arc>, @@ -341,7 +342,6 @@ impl Worker { last_known_status: current_status, metrics_status, worker_estimate_coefficient: deps.config().memory.worker_estimate_coefficient, - component_size_coefficient: deps.config().memory.component_size_coefficient, oom_retry_config: deps.config().memory.oom_retry_config.clone(), snapshot_policy, update_state_lock: Mutex::new(()), @@ -797,15 +797,29 @@ impl Worker { self.execution_status.read().unwrap().agent_mode() } - /// Gets the estimated memory requirement of the worker + /// Gets the estimated memory requirement of the worker. + /// + /// This covers only the per-worker linear memory. The compiled component + /// module is shared by all workers of a component and is charged once per + /// resident component via the component-charge registry, not per worker. pub async fn memory_requirement(&self) -> Result { let metadata = self.get_latest_worker_metadata().await; - let ml = metadata.last_known_status.total_linear_memory_size as f64; - let sw = metadata.last_known_status.component_size as f64; - let c = self.component_size_coefficient; - let x = self.worker_estimate_coefficient; - Ok((x * (ml + c * sw)) as u64) + let linear_memory_bytes = metadata.last_known_status.total_linear_memory_size as f64; + let estimate_coefficient = self.worker_estimate_coefficient; + Ok((estimate_coefficient * linear_memory_bytes) as u64) + } + + /// Returns the component identity and compiled-module size used to charge + /// the shared module memory once per resident component. 
+ pub async fn component_charge_requirement( + &self, + ) -> Result<(ComponentId, ComponentRevision, u64), WorkerExecutorError> { + let metadata = self.get_latest_worker_metadata().await; + let component_id = self.owned_agent_id.component_id(); + let component_revision = metadata.last_known_status.component_revision; + let component_module_bytes = metadata.last_known_status.component_size; + Ok((component_id, component_revision, component_module_bytes)) } /// Gets the storage requirement of the worker based on the last known status. @@ -2192,6 +2206,7 @@ impl Worker { async fn start_waiting_worker( this: Arc>, permit: WorkerMemoryPermit, + component_charge: WorkerComponentCharge, filesystem_storage_permit: Option, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, oom_retry_count: u32, @@ -2207,6 +2222,7 @@ impl Worker { this.queue.clone(), this.clone(), permit, + component_charge, concurrent_agent_permit, oom_retry_count, ) @@ -2361,6 +2377,27 @@ impl WaitingWorker { // concurrency slot. Otherwise one account could fill the memory // pool with workers that are not allowed to run yet. let permit = parent.active_workers().acquire(memory_requirement).await; + // Charge the component's compiled module size once per resident + // component (shared by all its workers). Held for as long as this + // worker is resident. + let component_charge = match parent.component_charge_requirement().await { + Ok((component_id, component_revision, component_module_bytes)) => { + parent + .active_workers() + .acquire_component_charge( + component_id, + component_revision, + component_module_bytes, + ) + .await + } + Err(err) => { + warn!( + "Failed to determine component charge requirement, not starting: {err}" + ); + return; + } + }; // Pre-acquire storage permits for this restart. 
// // We need to acquire `filesystem_storage_requirement + desired_extra` total: @@ -2412,6 +2449,7 @@ impl WaitingWorker { Worker::start_waiting_worker( parent, permit, + component_charge, filesystem_storage_permit, concurrent_agent_permit, oom_retry_count, @@ -2444,6 +2482,12 @@ struct RunningWorker { sender: UnboundedSender, queue: Arc>>, permit: WorkerMemoryPermit, + /// Keeps this worker's component module charge alive for as long as the + /// worker is resident. Held only to be dropped: dropping it releases the + /// component's residency, and the module charge if this was the last worker + /// of the component. + #[allow(dead_code)] + component_charge: Box, /// Storage semaphore permits held by this worker. `None` until storage /// space is first acquired (at startup or on first write). Dropped /// automatically when `RunningWorker` is dropped, returning storage @@ -2475,6 +2519,7 @@ impl RunningWorker { queue: Arc>>, parent: Arc>, permit: WorkerMemoryPermit, + component_charge: WorkerComponentCharge, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, oom_retry_count: u32, ) -> Self { @@ -2525,6 +2570,7 @@ impl RunningWorker { sender, queue, permit, + component_charge: Box::new(component_charge), filesystem_storage_permit: None, waiting_for_command, interrupt_signal, From 35874d34e7a628a5093dee4e8c4f4843b8305adf Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 00:00:46 -0700 Subject: [PATCH 22/60] fix(worker-executor): disable measured admission when executor does not own its memory environment --- .../config/debug-worker-executor.sample.env | 2 + .../config/debug-worker-executor.toml | 2 + golem-worker-executor-test-utils/src/lib.rs | 12 +++++ .../config/worker-executor.sample.env | 3 ++ .../config/worker-executor.toml | 3 ++ .../services/active_workers/memory_probe.rs | 27 ++++++++---- .../src/services/active_workers/mod.rs | 44 +++++++++---------- 
.../src/services/golem_config.rs | 13 ++++++ 8 files changed, 76 insertions(+), 30 deletions(-) diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env index d717cf777a..7d95b6f7dc 100644 --- a/golem-debugging-service/config/debug-worker-executor.sample.env +++ b/golem-debugging-service/config/debug-worker-executor.sample.env @@ -57,6 +57,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -232,6 +233,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml index 7d23b08cd5..316dddd29a 100644 --- a/golem-debugging-service/config/debug-worker-executor.toml +++ b/golem-debugging-service/config/debug-worker-executor.toml @@ -98,6 +98,7 @@ max_oplog_query_pages_size = 100 acquire_retry_delay = "500ms" admission_reserve_bytes = 268435456 component_size_coefficient = 2.0 +enable_measured_admission = true worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -368,6 +369,7 @@ without_time = false # acquire_retry_delay = "500ms" # admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 +# enable_measured_admission = true # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # diff --git 
a/golem-worker-executor-test-utils/src/lib.rs b/golem-worker-executor-test-utils/src/lib.rs index ee81b41531..fcfb661670 100644 --- a/golem-worker-executor-test-utils/src/lib.rs +++ b/golem-worker-executor-test-utils/src/lib.rs @@ -533,6 +533,15 @@ fn make_base_test_config(deps: &WorkerExecutorTestDependencies) -> GolemConfig { // without attempting a gRPC connection to a registry service that does // not exist in this test setup. resource_limits: ResourceLimitsConfig::Disabled(ResourceLimitsDisabledConfig {}), + // The measured-headroom admission gate requires the executor to own its + // memory environment (cgroup/process). The in-process test harness runs + // the executor alongside the test framework and other services, so the + // probe cannot isolate this executor's footprint — disable it and gate on + // the estimate semaphore alone, matching pre-gate behaviour. + memory: MemoryConfig { + enable_measured_admission: false, + ..Default::default() + }, ..Default::default() } } @@ -696,6 +705,9 @@ pub async fn start_customized( apply_sqlite_storage_config(&mut config, deps, context); config.memory = MemoryConfig { system_memory_override, + // Measured admission disabled in the shared in-process test harness; the + // small system_memory_override here drives the estimate semaphore alone. 
+ enable_measured_admission: false, ..Default::default() }; config.filesystem_storage = FilesystemStorageConfig { diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index dc33d7b3c1..bc7bf2c3c0 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env @@ -74,6 +74,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -295,6 +296,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -486,6 +488,7 @@ GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index 265ec5f904..819c4fe03d 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ b/golem-worker-executor/config/worker-executor.toml @@ -127,6 +127,7 @@ max_oplog_query_pages_size = 100 acquire_retry_delay = "500ms" admission_reserve_bytes = 268435456 component_size_coefficient = 2.0 +enable_measured_admission = true 
worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -460,6 +461,7 @@ without_time = false # acquire_retry_delay = "500ms" # admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 +# enable_measured_admission = true # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # @@ -763,6 +765,7 @@ without_time = false # acquire_retry_delay = "500ms" # admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 +# enable_measured_admission = true # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs index 0d1c4088a3..346a3dd363 100644 --- a/golem-worker-executor/src/services/active_workers/memory_probe.rs +++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs @@ -163,18 +163,29 @@ impl MemoryProbe for CgroupV2Probe { } } -/// Constructs the best available probe for the current platform. +/// Constructs the best available probe. /// -/// On Linux, prefers cgroup v2; falls back to process RSS. On other targets, -/// uses process RSS until dedicated backends land. `limit_bytes` is the limit -/// to charge against and is also the fallback when the cgroup reports an -/// unlimited `memory.max`. -pub fn default_probe(limit_bytes: u64) -> Box { +/// When `memory_override` is set, the limit is self-declared and treated as an +/// isolated budget measured against this process's RSS — the executor does not +/// assume it owns a cgroup. When it is `None`, the executor is assumed to own +/// its memory environment, so on Linux the exact cgroup v2 numbers are used +/// (falling back to host RAM / process RSS otherwise). 
+pub fn default_probe(memory_override: Option) -> Box { + if let Some(limit) = memory_override { + return Box::new(ProcessRssProbe::new(limit)); + } + + let host_ram = { + let mut sysinfo = sysinfo::System::new(); + sysinfo.refresh_memory(); + sysinfo.total_memory() + }; + #[cfg(target_os = "linux")] { - if let Some(probe) = CgroupV2Probe::try_new(limit_bytes) { + if let Some(probe) = CgroupV2Probe::try_new(host_ram) { return Box::new(probe); } } - Box::new(ProcessRssProbe::new(limit_bytes)) + Box::new(ProcessRssProbe::new(host_ram)) } diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 34f8b190ec..7e95da1703 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -81,8 +81,10 @@ pub struct ActiveWorkers { /// Authoritative measured-headroom admission gate. Decides whether real /// memory headroom permits a new acquisition, evicting via the worker set /// when short. The estimate-based `worker_memory` semaphore is the cheap - /// pre-filter and atomic commit in front of it. - admission: AdmissionController, + /// pre-filter and atomic commit in front of it. `None` when measured + /// admission is disabled (e.g. shared test environments) — admission then + /// relies on the estimate semaphore alone. + admission: Option, /// Charges each resident component's compiled module size to the estimate /// pool exactly once (shared across all its workers) rather than per worker. 
component_charges: @@ -137,10 +139,12 @@ impl Drop for WorkerMemoryPermit { impl ActiveWorkers { pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self { let worker_memory_size = memory_config.worker_memory(); - let admission = AdmissionController::new( - default_probe(memory_config.total_system_memory()), - memory_config.admission_policy(), - ); + let admission = memory_config.enable_measured_admission.then(|| { + AdmissionController::new( + default_probe(memory_config.system_memory_override), + memory_config.admission_policy(), + ) + }); let workers = Cache::new( None, FullCacheEvictionMode::None, @@ -269,14 +273,12 @@ impl ActiveWorkers { .expect("requested memory size is too large"); loop { - // Authoritative measured-headroom gate. Evicts idle-then-warm when - // real headroom is short; rejects (and we back off) when it cannot - // make room rather than risking the limit. - if self - .admission - .try_admit(memory, &self.eviction_source()) - .await - == AdmissionDecision::Reject + // Authoritative measured-headroom gate (when enabled). Evicts + // idle-then-warm when real headroom is short; rejects (and we back + // off) when it cannot make room rather than risking the limit. + if let Some(admission) = &self.admission + && admission.try_admit(memory, &self.eviction_source()).await + == AdmissionDecision::Reject { debug!("Measured headroom insufficient for {mem32}, backing off and retrying"); tokio::time::sleep(self.acquire_retry_delay).await; @@ -314,14 +316,12 @@ impl ActiveWorkers { .try_into() .expect("requested memory size is too large"); - // Authoritative measured-headroom gate. Single attempt (this is the - // non-blocking path): if real headroom is insufficient even after - // eviction, do not admit. - if self - .admission - .try_admit(memory, &self.eviction_source()) - .await - == AdmissionDecision::Reject + // Authoritative measured-headroom gate (when enabled). 
Single attempt + // (this is the non-blocking path): if real headroom is insufficient even + // after eviction, do not admit. + if let Some(admission) = &self.admission + && admission.try_admit(memory, &self.eviction_source()).await + == AdmissionDecision::Reject { debug!("Measured headroom insufficient for {mem32}, not admitting"); return None; diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index fdac19ed9c..5a95e0056f 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -970,6 +970,13 @@ pub struct MemoryConfig { /// against concurrent admissions overshooting before becoming resident. Used /// by the measured-headroom admission gate. pub admission_reserve_bytes: u64, + /// Whether the measured-headroom admission gate is active. Requires the + /// executor to own its memory environment (its own cgroup/process), as in a + /// production pod. Disable in shared environments — such as the in-process + /// test harness — where the probe cannot isolate this executor's footprint + /// from co-resident processes; admission then relies on the estimate + /// semaphore alone. 
+ pub enable_measured_admission: bool, #[serde(with = "humantime_serde")] pub acquire_retry_delay: Duration, pub oom_retry_config: RetryConfig, @@ -1032,6 +1039,11 @@ impl SafeDisplay for MemoryConfig { "admission reserve bytes: {}", self.admission_reserve_bytes ); + let _ = writeln!( + &mut result, + "measured admission enabled: {}", + self.enable_measured_admission + ); let _ = writeln!( &mut result, "acquire retry delay: {:?}", @@ -1558,6 +1570,7 @@ impl Default for MemoryConfig { worker_estimate_coefficient: 1.1, component_size_coefficient: 2.0, admission_reserve_bytes: 256 * 1024 * 1024, + enable_measured_admission: true, acquire_retry_delay: Duration::from_millis(500), oom_retry_config: RetryConfig { max_attempts: u32::MAX, From acb9968ca3008f71d6ee254148fd085933ec9a9a Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 00:01:18 -0700 Subject: [PATCH 23/60] feat(benchmark): add throughput-under-memory-saturation benchmarks --- .../cloud-density-saturation.yaml | 64 +++ integration-tests/src/benchmarks/all.rs | 24 + integration-tests/src/benchmarks/mod.rs | 1 + .../src/benchmarks/throughput_saturation.rs | 423 ++++++++++++++++++ test-components/agent-counters/src/lib.rs | 91 +++- 5 files changed, 601 insertions(+), 2 deletions(-) create mode 100644 integration-tests/benchmark_suites/cloud-density-saturation.yaml create mode 100644 integration-tests/src/benchmarks/throughput_saturation.rs diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml new file mode 100644 index 0000000000..59d1409efc --- /dev/null +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -0,0 +1,64 @@ +# Cloud throughput-saturation benchmark suite. 
+# +# Unlike cloud-perf's throughput benchmarks (which keep `size` small enough that +# all workers fit in memory), this suite deliberately ramps the number of +# active, memory-holding agents up to and past the executor's memory ceiling to +# find the per-pod active-agent capacity and the throughput sustained once +# memory is exhausted. +# +# Each agent retains a deterministic, per-agent-distinct amount of resident +# memory, so the fleet presents a mix of footprints near the limit (exercising +# the admission/eviction path). The measured phase drives one in-flight +# `busy_for` call per agent and records aggregate throughput. +# +# Run with the benchmarks binary's `cloud` subcommand (same flags as cloud-perf): +# +# benchmarks suite integration-tests/benchmark_suites/cloud-density-saturation.yaml \ +# --save-to-json result.json \ +# cloud --api-url https://API_HOST --apps-base-domain APPS_BASE_DOMAIN \ +# --admin-account-token ADMIN_TOKEN --builtin-plugin-owner-account-id ACCOUNT_ID \ +# --default-plan-id PLAN_ID --component-directory COMPONENT_DIR +# +# Reading the result: plot `saturation-throughput-ops-per-sec` and +# invocation-retries/timeouts against `size`. Throughput climbs with `size` +# until the pod's memory is exhausted, then plateaus or drops while retries and +# eviction churn rise — that knee is the active-agent ceiling. +# +# `clusterSize` is ignored in cloud mode (single observed cluster). + +name: cloud-density-saturation +benchmarks: + # Rust echo agents — lean per-instance linear memory (the ~900 KB module is + # charged once per component, shared across all agents; what scales per agent + # is the small instance heap). Knee expected in the thousands, so sweep high. + # The current admission algorithm craters around ~700 agents/node; the reworked + # admission is expected to push this knee substantially higher.
+ - name: throughput-saturation-echo-rust + iterations: 3 + clusterSize: [2] + size: [500, 1000, 2000, 4000, 8000, 12000] + length: [0] + + # TypeScript echo agents — each instance instantiates its own QuickJS runtime + # and JS heap in its own linear memory (the 17.4 MB module is shared once per + # component; the per-instance runtime state is the heavy per-agent cost). + # Expect the knee in the hundreds, well below the Rust variant. + - name: throughput-saturation-echo-ts + iterations: 3 + clusterSize: [2] + size: [100, 250, 500, 750, 1000, 1500, 2000] + length: [0] + + # Synthetic footprint — each agent retains a deterministic per-agent-distinct + # amount of resident memory, exercising the admission/eviction path with a + # controllable footprint near the limit. + # size = number of active, memory-holding agents (the ramp axis) + # length = base per-agent memory footprint in bytes; each agent retains a + # deterministic multiple (1x..8x), averaging ~4.5x. 4 MiB base => + # ~18 MiB average per agent, filling a ~10 GiB usable pool around + # ~580 agents (bracketing the old ~700 crater point). 
+ - name: throughput-saturation-counters + iterations: 3 + clusterSize: [2] + size: [100, 250, 500, 750, 1000, 1500] + length: [4194304] diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs index 9b1efd1eb2..e79ac78612 100644 --- a/integration-tests/src/benchmarks/all.rs +++ b/integration-tests/src/benchmarks/all.rs @@ -133,6 +133,30 @@ async fn main() { >(mode, verbosity, item, primary_only, otlp)) }), ); + benchmarks_by_name.insert( + "throughput-saturation-counters", + Box::new(|mode, verbosity, item, primary_only, otlp| { + Box::pin(run_benchmark::< + integration_tests::benchmarks::throughput_saturation::ThroughputSaturationCounters, + >(mode, verbosity, item, primary_only, otlp)) + }), + ); + benchmarks_by_name.insert( + "throughput-saturation-echo-rust", + Box::new(|mode, verbosity, item, primary_only, otlp| { + Box::pin(run_benchmark::< + integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoRust, + >(mode, verbosity, item, primary_only, otlp)) + }), + ); + benchmarks_by_name.insert( + "throughput-saturation-echo-ts", + Box::new(|mode, verbosity, item, primary_only, otlp| { + Box::pin(run_benchmark::< + integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoTs, + >(mode, verbosity, item, primary_only, otlp)) + }), + ); let params = BenchmarkCliParameters::parse_from(std::env::args_os()); let tracer_provider = BenchmarkTestDependencies::init_logging(¶ms); diff --git a/integration-tests/src/benchmarks/mod.rs b/integration-tests/src/benchmarks/mod.rs index d1651f063f..0682055643 100644 --- a/integration-tests/src/benchmarks/mod.rs +++ b/integration-tests/src/benchmarks/mod.rs @@ -35,6 +35,7 @@ pub mod durability_overhead; pub mod latency; pub mod sleep; pub mod throughput; +pub mod throughput_saturation; // Re-export cleanup helpers so callers can use the flat `benchmarks::*` path. 
pub use cleanup::{cleanup_account, cleanup_env_and_app, cleanup_user_state}; diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs new file mode 100644 index 0000000000..665ff3ec48 --- /dev/null +++ b/integration-tests/src/benchmarks/throughput_saturation.rs @@ -0,0 +1,423 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Throughput-under-memory-saturation benchmarks. +//! +//! Unlike the regular throughput benchmark — which keeps `size` small enough +//! that all workers fit comfortably in memory — these benchmarks deliberately +//! ramp the number of *active* agents up to and past the executor's memory +//! ceiling, to find the knee: the agent count where the pod can still keep +//! everything resident (latency flat, throughput scaling linearly) just before +//! it starts evicting and replaying (latency spikes, throughput craters). +//! +//! The measured `run` phase drives sustained load over a fixed window: each +//! agent repeatedly does a short unit of work then goes idle for [`IDLE_GAP`]. +//! During that gap the agent has no in-flight work and becomes a `LoadedIdle` +//! eviction candidate, so under memory pressure it can be evicted and then must +//! reload (oplog replay + re-admission) on its next call — the churn that makes +//! throughput crater past the knee. Starts are staggered so the fleet is not +//! synchronised. +//! +//! 
Three variants: +//! - `throughput-saturation-counters`: agent-counters with a synthetic, +//! per-agent-distinct retained footprint (`allocate_memory`) plus CPU work +//! (`busy_for`). The footprint is controllable via `length`. +//! - `throughput-saturation-echo-rust` / `throughput-saturation-echo-ts`: the +//! benchmark `echo` agent (Rust / TS) called repeatedly. No synthetic +//! footprint — the per-agent memory is the agent's natural footprint, which +//! for the TS agent includes the QuickJS runtime. Answers "how many actively +//! invoked echo agents fit per pod". +//! +//! Parameters: +//! - `size` = number of active agents in this step (the ramp axis). +//! - `length` = for the counters variant, the base per-agent memory footprint in +//! bytes (agent `i` retains a deterministic multiple); ignored by the echo +//! variants. + +use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent}; +use async_trait::async_trait; +use futures_concurrency::future::Join; +use golem_common::base_model::agent::{DataValue, ParsedAgentId}; +use golem_common::model::AgentId; +use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; +use golem_common::{agent_id, data_value}; +use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig}; +use golem_test_framework::config::benchmark::TestMode; +use golem_test_framework::config::dsl_impl::TestUserContext; +use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies}; +use golem_test_framework::dsl::{TestDsl, TestDslExtended}; +use indoc::indoc; +use std::time::{Duration, Instant}; +use tracing::{Instrument, Level, info}; + +/// Number of distinct footprint buckets the synthetic per-agent memory spread +/// cycles through, so the fleet holds a mix of sizes rather than a uniform +/// amount. +const SPREAD_BUCKETS: usize = 8; + +/// CPU busy time (ms) per `busy_for` invocation (counters variant only). 
+const BUSY_MILLIS: u32 = 50; + +/// Idle gap each agent sleeps between calls. During this gap the agent has no +/// in-flight work and becomes a `LoadedIdle` eviction candidate. Under memory +/// pressure it may be evicted and then must reload on its next call — the churn +/// this benchmark exists to measure. +const IDLE_GAP: Duration = Duration::from_millis(200); + +/// Total measured wall-clock duration of the sustained-load phase. Throughput +/// and churn are measured over this fixed window so steps with different `size` +/// are comparable. +const RUN_DURATION: Duration = Duration::from_secs(30); + +/// Maximum per-agent start stagger, so the fleet is not synchronised: at any +/// instant some agents are mid-call (demanding memory) while others sit idle +/// (evictable). +const MAX_STAGGER: Duration = Duration::from_millis(250); + +/// Resident memory (bytes) the synthetic-footprint agent `index` retains for a +/// given `base`. Spreads deterministically across [`SPREAD_BUCKETS`] buckets +/// (`base * 1` .. `base * SPREAD_BUCKETS`) so different agents hold different +/// amounts and some sit much closer to the limit than others. +fn agent_memory_bytes(index: usize, base: usize) -> u32 { + let bucket = (index % SPREAD_BUCKETS) + 1; + (base.saturating_mul(bucket)).min(u32::MAX as usize) as u32 +} + +/// Per-agent start offset derived deterministically from the index, spread +/// across `[0, MAX_STAGGER)`. +fn agent_stagger(index: usize) -> Duration { + let frac = (index as u32).wrapping_mul(2_654_435_761) % 1000; + MAX_STAGGER.checked_mul(frac).unwrap_or_default() / 1000 +} + +/// Describes one saturation variant: which component to load, which agent type +/// and method to actively invoke, and whether to pre-load a synthetic footprint. +struct SaturationVariant { + /// WASM file name (without `.wasm`) in the component directory. + wasm_name: &'static str, + /// Registry display name for the component. 
+ component_name: &'static str, + /// Agent type to instantiate. + agent_type: &'static str, + /// Method invoked repeatedly during the measured phase. + active_method: &'static str, + /// Builds the parameter for one `active_method` call. + active_params: fn() -> DataValue, + /// When set, each agent calls this method once in warmup with its + /// deterministic footprint (`allocate_memory`-style). `None` for the echo + /// variants, whose footprint is the agent's natural memory. + allocate_method: Option<&'static str>, +} + +const COUNTERS_VARIANT: SaturationVariant = SaturationVariant { + wasm_name: "it_agent_counters_release", + component_name: "it:agent-counters", + agent_type: "Counter", + active_method: "busy-for", + active_params: || data_value!(BUSY_MILLIS), + allocate_method: Some("allocate-memory"), +}; + +const ECHO_RUST_VARIANT: SaturationVariant = SaturationVariant { + wasm_name: "benchmark_agent_rust_release", + component_name: "benchmark:agent-rust", + agent_type: "RustBenchmarkAgent", + active_method: "echo", + active_params: || data_value!("saturation"), + allocate_method: None, +}; + +const ECHO_TS_VARIANT: SaturationVariant = SaturationVariant { + wasm_name: "benchmark_agent_ts", + component_name: "benchmark:agent-ts", + agent_type: "BenchmarkAgent", + active_method: "echo", + active_params: || data_value!("saturation"), + allocate_method: None, +}; + +pub struct SaturationBenchmarkContext { + deps: BenchmarkTestDependencies, +} + +pub struct SaturationIterationContext { + user: TestUserContext, + component: ComponentDto, + agent_ids: Vec<ParsedAgentId>, + base_memory_bytes: usize, + env_id: EnvironmentId, +} + +/// Shared implementation for all saturation variants. The variant-specific +/// config is supplied by the wrapper types' `variant()`.
+async fn create_context( + mode: &TestMode, + verbosity: Level, + cluster_size: usize, + disable_compilation_cache: bool, + otlp: bool, +) -> SaturationBenchmarkContext { + SaturationBenchmarkContext { + deps: BenchmarkTestDependencies::new( + mode, + verbosity, + cluster_size, + disable_compilation_cache, + otlp, + ) + .await, + } +} + +async fn setup_iteration( + variant: &SaturationVariant, + config: &RunConfig, + benchmark_context: &SaturationBenchmarkContext, +) -> SaturationIterationContext { + let user = benchmark_context.deps.user().await.unwrap(); + let (_, env) = user.app_and_env().await.unwrap(); + + info!("Registering component {}", variant.component_name); + let component = user + .component(&env.id, variant.wasm_name) + .name(variant.component_name) + .store() + .await + .unwrap(); + + let mut agent_ids = vec![]; + for n in 0..config.size { + agent_ids.push(agent_id!(variant.agent_type, format!("saturation-{n}"))); + } + + SaturationIterationContext { + user, + component, + agent_ids, + base_memory_bytes: config.length, + env_id: env.id, + } +} + +async fn warmup(variant: &SaturationVariant, context: &SaturationIterationContext) { + let Some(allocate_method) = variant.allocate_method else { + // Echo variants: nothing to pre-load; the agent's natural footprint is + // established on first invocation. 
+ return; + }; + + async { + let base = context.base_memory_bytes; + let result_futures = context + .agent_ids + .iter() + .enumerate() + .map(move |(idx, agent_id)| async move { + let user_clone = context.user.clone(); + let bytes = agent_memory_bytes(idx, base); + invoke_and_await_agent( + &user_clone, + &context.component, + agent_id, + allocate_method, + data_value!(bytes), + ) + .await + }) + .collect::<Vec<_>>(); + let _ = result_futures.join().await; + } + .instrument(tracing::info_span!( + "warmup_allocate_memory", + agent_count = context.agent_ids.len() + )) + .await; +} + +async fn run( + variant: &SaturationVariant, + context: &SaturationIterationContext, + recorder: BenchmarkRecorder, +) { + let agent_count = context.agent_ids.len(); + let deadline = Instant::now() + RUN_DURATION; + + let result_futures = context + .agent_ids + .iter() + .enumerate() + .map(|(idx, agent_id)| { + let recorder = recorder.clone(); + async move { + let user_clone = context.user.clone(); + + tokio::time::sleep(agent_stagger(idx)).await; + + let mut calls = 0u64; + while Instant::now() < deadline { + let result = invoke_and_await_agent( + &user_clone, + &context.component, + agent_id, + variant.active_method, + (variant.active_params)(), + ) + .await; + result.record(&recorder, "", idx.to_string().as_str()); + calls += 1; + tokio::time::sleep(IDLE_GAP).await; + } + calls + } + }) + .collect::<Vec<_>>(); + + let started = Instant::now(); + let per_agent_calls = result_futures.join().await; + let elapsed = started.elapsed(); + + // Aggregate sustained throughput over the fixed run window. Across `size` + // steps, this reveals where added active agents stop adding throughput + // (memory saturation / eviction churn dominates) — the knee we are after.
+ let total_calls: u64 = per_agent_calls.iter().sum(); + let secs = elapsed.as_secs_f64(); + if secs > 0.0 { + let ops_per_sec = (total_calls as f64 / secs).round() as u64; + info!( + "saturation: {agent_count} agents, {total_calls} calls in {secs:.1}s = {ops_per_sec} ops/sec" + ); + recorder.count( + &ResultKey::primary("saturation-throughput-ops-per-sec"), + ops_per_sec, + ); + } +} + +async fn cleanup_iteration(context: SaturationIterationContext) { + let agent_ids: Vec<AgentId> = context + .agent_ids + .iter() + .filter_map(|agent_id| AgentId::from_agent_id(context.component.id, agent_id).ok()) + .collect(); + delete_workers(&context.user, &agent_ids).await; + cleanup_user_state(&context.user, &context.env_id).await; +} + +/// Generates a `Benchmark` impl wrapper for a saturation variant. +macro_rules! saturation_benchmark { + ($ty:ident, $bench_name:literal, $variant:expr, $description:literal) => { + pub struct $ty { + config: RunConfig, + } + + #[async_trait] + impl Benchmark for $ty { + type BenchmarkContext = SaturationBenchmarkContext; + type IterationContext = SaturationIterationContext; + + fn name() -> &'static str { + $bench_name + } + + fn description() -> &'static str { + indoc!
{ $description } + } + + async fn create_benchmark_context( + mode: &TestMode, + verbosity: Level, + cluster_size: usize, + disable_compilation_cache: bool, + otlp: bool, + ) -> Self::BenchmarkContext { + create_context( + mode, + verbosity, + cluster_size, + disable_compilation_cache, + otlp, + ) + .await + } + + async fn cleanup(benchmark_context: Self::BenchmarkContext) { + benchmark_context.deps.kill_all().await; + } + + async fn create(_mode: &TestMode, config: RunConfig) -> Self { + Self { config } + } + + async fn setup_iteration( + &self, + benchmark_context: &Self::BenchmarkContext, + ) -> Self::IterationContext { + setup_iteration(&$variant, &self.config, benchmark_context).await + } + + async fn warmup( + &self, + _benchmark_context: &Self::BenchmarkContext, + context: &Self::IterationContext, + ) { + warmup(&$variant, context).await + } + + async fn run( + &self, + _benchmark_context: &Self::BenchmarkContext, + context: &Self::IterationContext, + recorder: BenchmarkRecorder, + ) { + run(&$variant, context, recorder).await + } + + async fn cleanup_iteration( + &self, + _benchmark_context: &Self::BenchmarkContext, + context: Self::IterationContext, + ) { + cleanup_iteration(context).await + } + } + }; +} + +saturation_benchmark!( + ThroughputSaturationCounters, + "throughput-saturation-counters", + COUNTERS_VARIANT, + "Ramps `size` active agents that each retain a deterministic, per-agent-distinct + synthetic memory footprint (controlled by `length`) and do CPU work, measuring + sustained throughput to locate the memory-saturation knee." +); + +saturation_benchmark!( + ThroughputSaturationEchoRust, + "throughput-saturation-echo-rust", + ECHO_RUST_VARIANT, + "Ramps `size` actively-invoked Rust `echo` agents to find how many fit resident + per pod before eviction churn craters throughput. The per-agent footprint is the + agent's natural memory (no synthetic allocation)." 
+); + +saturation_benchmark!( + ThroughputSaturationEchoTs, + "throughput-saturation-echo-ts", + ECHO_TS_VARIANT, + "Ramps `size` actively-invoked TypeScript `echo` agents to find how many fit + resident per pod before eviction churn craters throughput. The per-agent + footprint is the agent's natural memory, including the QuickJS runtime." +); diff --git a/test-components/agent-counters/src/lib.rs b/test-components/agent-counters/src/lib.rs index b2ac7d4d44..b14840512d 100644 --- a/test-components/agent-counters/src/lib.rs +++ b/test-components/agent-counters/src/lib.rs @@ -3,6 +3,43 @@ pub mod repository; use golem_rust::{agent_definition, agent_implementation, generate_idempotency_key}; +/// Page size used when touching retained memory so the OS backs it with real +/// resident pages rather than leaving it as untouched (non-resident) reservation. +const PAGE_SIZE: usize = 4096; + +/// Spins doing cheap arithmetic for approximately `millis` milliseconds, polling +/// the monotonic clock between batches of work rather than on every iteration so +/// the workload is CPU-bound, not clock-syscall-bound. Returns an accumulated +/// value so the work cannot be optimised away. +fn busy_loop(millis: u32) -> u32 { + let deadline = std::time::Duration::from_millis(millis as u64); + let start = std::time::Instant::now(); + let mut acc: u32 = 0; + loop { + for i in 0..10_000u32 { + acc = acc.wrapping_add(i).wrapping_mul(31).wrapping_add(7); + } + if start.elapsed() >= deadline { + break; + } + } + acc +} + +/// Grows `buffer` to hold `bytes` and touches one byte per page so the memory +/// becomes resident (real RSS), not just reserved address space. 
+fn retain_memory(buffer: &mut Vec<u8>, bytes: u32) { + let bytes = bytes as usize; + buffer.clear(); + buffer.shrink_to_fit(); + buffer.resize(bytes, 0); + let mut page = 0; + while page < bytes { + buffer[page] = buffer[page].wrapping_add(1); + page += PAGE_SIZE; + } +} + #[agent_definition] trait Counter { fn new(id: String) -> Self; @@ -10,17 +47,32 @@ trait Counter { async fn increment_through_rpc(&mut self) -> u32; async fn increment_through_rpc_to_ephemeral(&mut self) -> u32; async fn increment_through_rpc_to_ephemeral_phantom(&mut self) -> u32; + + /// Spins for `millis` milliseconds of cheap CPU work, then increments and + /// returns the counter. Used to define an "active" agent without making the + /// workload oplog-bound on a tight loop. + fn busy_for(&mut self, millis: u32) -> u32; + + /// Retains `bytes` of resident linear memory in the agent's state and + /// increments the counter. The memory stays resident across invocations so + /// the agent contributes a controllable footprint to the executor's pool. + fn allocate_memory(&mut self, bytes: u32) -> u32; } struct CounterImpl { count: u32, id: String, + retained: Vec<u8>, } #[agent_implementation] impl Counter for CounterImpl { fn new(id: String) -> Self { - Self { id, count: 0 } + Self { + id, + count: 0, + retained: Vec::new(), + } } fn increment(&mut self) -> u32 { @@ -42,29 +94,64 @@ impl Counter for CounterImpl { let mut client = EphemeralSingletonCounterClient::new_phantom(); client.increment().await } + + fn busy_for(&mut self, millis: u32) -> u32 { + let _ = busy_loop(millis); + self.count += 1; + self.count + } + + fn allocate_memory(&mut self, bytes: u32) -> u32 { + retain_memory(&mut self.retained, bytes); + self.count += 1; + self.count + } } #[agent_definition(ephemeral)] trait EphemeralCounter { fn new(id: String) -> Self; fn increment(&mut self) -> u32; + + /// See [`Counter::busy_for`]. + fn busy_for(&mut self, millis: u32) -> u32; + + /// See [`Counter::allocate_memory`].
+ fn allocate_memory(&mut self, bytes: u32) -> u32; } struct EphemeralCounterImpl { count: u32, _id: String, + retained: Vec<u8>, } #[agent_implementation] impl EphemeralCounter for EphemeralCounterImpl { fn new(id: String) -> Self { - Self { _id: id, count: 0 } + Self { + _id: id, + count: 0, + retained: Vec::new(), + } } fn increment(&mut self) -> u32 { self.count += 1; self.count } + + fn busy_for(&mut self, millis: u32) -> u32 { + let _ = busy_loop(millis); + self.count += 1; + self.count + } + + fn allocate_memory(&mut self, bytes: u32) -> u32 { + retain_memory(&mut self.retained, bytes); + self.count += 1; + self.count + } } From bfe1b145b5561aa6613b629f58e8969bbacdfd0f Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 00:38:46 -0700 Subject: [PATCH 24/60] test(worker-executor): exercise admission reserve under maximum concurrent overlap --- .../active_workers/admission/tests.rs | 172 +++++++++++++----- 1 file changed, 122 insertions(+), 50 deletions(-) diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs index bd9b51aabb..50d545ddb0 100644 --- a/golem-worker-executor/src/services/active_workers/admission/tests.rs +++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs @@ -435,72 +435,144 @@ async fn usable_ratio_caps_admission_below_full_limit() { ); } -// ── Concurrency: the simultaneous-big-start race ───────────────────────────── +// ── Concurrency ────────────────────────────────────────────────────────────── +// +// In production, each admission reads headroom (`try_admit`) and then separately +// commits to the upstream atomic permit (modeled here as `pinned_usage += +// request`). The two steps are not serialised across concurrent admissions, so +// several admissions can read the same pre-commit snapshot, all pass the check, +// and all commit.
The `reserve` margin accounts for this instead of a lock: +// concurrent admissions may push usage above the carve-out ceiling into the +// reserve, but must not push it above the true `limit`. +// +// These tests force the maximum-overlap case with a barrier: every admission +// completes its headroom check before any admission commits. This makes the +// maximum overshoot deterministic rather than dependent on task scheduling, so +// an undersized reserve is reliably detected and a correctly sized one is +// actually exercised. + +/// Run `racers` admissions of `request` bytes against a fresh environment with +/// the given `reserve`, forcing all headroom checks to complete before any +/// commit (maximum overlap). Returns the final environment usage and the number +/// of admits granted. +async fn race_admissions_worst_case( + limit: u64, + initial_pinned: u64, + reserve: u64, + racers: usize, + request: u64, +) -> (u64, usize) { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: initial_pinned, + residents: vec![], + ..Default::default() + })); + let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve)); + // All racers check before any commits: the maximum-overlap schedule. + let barrier = Arc::new(tokio::sync::Barrier::new(racers)); + + let mut handles = Vec::new(); + for _ in 0..racers { + let ctrl = ctrl.clone(); + let state = state.clone(); + let barrier = barrier.clone(); + handles.push(tokio::spawn(async move { + let source = FakeEvictionSource { + state: state.clone(), + }; + let decision = ctrl.try_admit(request, &source).await; + // Hold every racer here until all have decided against the same + // pre-commit snapshot, then let the commits run together. 
+ barrier.wait().await; + if decision == AdmissionDecision::Admit { + state.lock().unwrap().pinned_usage += request; + true + } else { + false + } + })); + } + let mut admitted = 0; + for h in handles { + if h.await.unwrap() { + admitted += 1; + } + } + let usage = state.lock().unwrap().usage(); + (usage, admitted) +} proptest! { - /// The contract for the safety invariant under concurrency. - /// - /// Many admissions race at once with no external serialisation across the - /// headroom check and the commit (the commit models the upstream atomic - /// permit grant; the check is a separate prior read, so a genuine - /// time-of-check/time-of-use window exists between concurrent tasks). + /// A reserve sized for the maximum concurrent overshoot keeps real usage + /// under the limit even when every racer checks before any commits, with a + /// non-trivial near-ceiling pinned base. /// - /// The invariant: real usage must never exceed the true `limit`. Admissions - /// may collectively overshoot the carve-out ceiling into the reserve — that - /// is what the reserve is for — but never past `limit` itself. The reserve - /// is sized here to cover the worst-case concurrent overshoot (number of - /// racers × max request), so a passing test means the reserve margin is a - /// sufficient substitute for serialising the gate. If this ever fails, the - /// margin is insufficient for the chosen concurrency and the gate's - /// correctness depends on stronger synchronisation. + /// Sizing: at most all `racers` can pass against the same pre-commit + /// snapshot, so the reserve must cover `racers × request` landing in the + /// window between check and commit. With that margin, usage stays + /// `<= limit`. 
#[test] - fn concurrent_admissions_never_exceed_limit( + fn sufficient_reserve_holds_under_worst_case_overlap( racers in 2usize..16, request in 50u64..400, + base_fill in 0u64..2000, ) { - // Worst case: every racer passes the check against the same snapshot and - // commits. The reserve must cover (racers - 1) extra in-flight requests - // beyond the one the headroom was actually sized for. let reserve = request * racers as u64; - // Ceiling must leave room for at least one request above the reserve. - let limit = reserve + request + 1000; + // Limit leaves room for the pre-existing fill, the reserve, and at least + // one request's worth of admissible headroom above the reserve. + let limit = base_fill + reserve + request + 500; let rt = tokio::runtime::Builder::new_multi_thread() .worker_threads(4) .build() .unwrap(); rt.block_on(async move { - let state = Arc::new(Mutex::new(EnvState { - limit, - pinned_usage: 0, - residents: vec![], - ..Default::default() - })); - let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve)); - - let mut handles = Vec::new(); - for _ in 0..racers { - let ctrl = ctrl.clone(); - let state = state.clone(); - handles.push(tokio::spawn(async move { - let source = FakeEvictionSource { state: state.clone() }; - let decision = ctrl.try_admit(request, &source).await; - if decision == AdmissionDecision::Admit { - // Models the atomic permit grant: a single locked - // fetch-add, separate from the (already-completed) check. - state.lock().unwrap().pinned_usage += request; - } - })); - } - for h in handles { - h.await.unwrap(); - } + let (usage, _) = + race_admissions_worst_case(limit, base_fill, reserve, racers, request).await; + prop_assert!( + usage <= limit, + "maximum overlap drove usage {usage} past limit {limit}" + ); + Ok(()) + }).unwrap(); + } - let s = state.lock().unwrap(); + /// With no reserve and maximum overlap forced, several racers admitting at + /// once must push usage above the carve-out ceiling. 
This confirms the race + /// the design tolerates is real and this harness reproduces it; without it, + /// the safety test above could pass without ever exercising a concurrent + /// overshoot. Usage may still stay under `limit`; the assertion is on the + /// overshoot past the ceiling. + #[test] + fn worst_case_overlap_overshoots_ceiling_without_reserve( + racers in 2usize..12, + request in 50u64..400, + ) { + // Ceiling headroom sized for exactly one request; no reserve cushion. + let ceiling = request; + let limit = request * racers as u64 + 1000; + + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .build() + .unwrap(); + rt.block_on(async move { + // pinned = limit - ceiling so admissible headroom is exactly one + // request; with reserve 0, every racer sees room for itself. + let pinned = limit - ceiling; + let (usage, admitted) = + race_admissions_worst_case(limit, pinned, 0, racers, request).await; + // More than one admit means the gate let concurrent racers through + // on the same snapshot. 
+ prop_assert!( + admitted >= 2, + "expected concurrent over-admission with no reserve, got {admitted} admits" + ); prop_assert!( - s.usage() <= s.limit, - "concurrent admissions drove usage {} past limit {}", - s.usage(), s.limit + usage > ceiling + pinned, + "usage {usage} did not overshoot the ceiling {}", + ceiling + pinned ); Ok(()) }).unwrap(); From c3af739848f079bb71bae2bdce482301894358a3 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 09:08:18 -0700 Subject: [PATCH 25/60] feat(benchmark): longer sustained load, bumpt the number of agents --- .../cloud-density-saturation.yaml | 48 ++++++++++--------- .../src/benchmarks/throughput_saturation.rs | 6 ++- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index 59d1409efc..c877993cc6 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -28,37 +28,41 @@ name: cloud-density-saturation benchmarks: + # Synthetic footprint — each agent retains a deterministic per-agent-distinct + # amount of resident memory, exercising the admission/eviction path with a + # controllable footprint near the limit. Run first: this is the variant that + # actually fills memory and drives the gate to its reject/evict path. + # size = number of active, memory-holding agents (the ramp axis) + # length = base per-agent memory footprint in bytes; each agent retains a + # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => + # ~72 MiB average per agent, filling a ~10 GiB usable pool around + # ~145 agents. The sweep brackets that ceiling and pushes well past it + # so the admission gate's reject/evict behaviour near OOM is exercised. 
+ - name: throughput-saturation-counters + iterations: 1 + clusterSize: [2] + size: [50, 100, 150, 200, 300, 500] + length: [16777216] + # Rust echo agents — lean per-instance linear memory (the ~900 KB module is # charged once per component, shared across all agents; what scales per agent - # is the small instance heap). Knee expected in the thousands, so sweep high. - # The current admission algorithm craters around ~700 agents/node; the reworked - # admission is expected to push this knee substantially higher. + # is the small instance heap). The previous run reached the top of the sweep + # (12000) without saturating pod memory, so the knee here is throughput / + # eviction-churn rather than memory. Dropped the low points that told us + # nothing and pushed the range up with coarser steps. - name: throughput-saturation-echo-rust - iterations: 3 + iterations: 1 clusterSize: [2] - size: [500, 1000, 2000, 4000, 8000, 12000] + size: [4000, 8000, 16000, 24000, 32000] length: [0] # TypeScript echo agents — each instance instantiates its own QuickJS runtime # and JS heap in its own linear memory (the 17.4 MB module is shared once per # component; the per-instance runtime state is the heavy per-agent cost). - # Expect the knee in the hundreds, well below the Rust variant. + # Heavier per agent than the Rust variant, so a lower knee — but the previous + # run reached 2000 without saturating, so push higher and drop the low points. - name: throughput-saturation-echo-ts - iterations: 3 + iterations: 1 clusterSize: [2] - size: [100, 250, 500, 750, 1000, 1500, 2000] + size: [1000, 2000, 4000, 6000, 8000] length: [0] - - # Synthetic footprint — each agent retains a deterministic per-agent-distinct - # amount of resident memory, exercising the admission/eviction path with a - # controllable footprint near the limit. 
- # size = number of active, memory-holding agents (the ramp axis) - # length = base per-agent memory footprint in bytes; each agent retains a - # deterministic multiple (1x..8x), averaging ~4.5x. 4 MiB base => - # ~18 MiB average per agent, filling a ~10 GiB usable pool around - # ~580 agents (bracketing the old ~700 crater point). - - name: throughput-saturation-counters - iterations: 3 - clusterSize: [2] - size: [100, 250, 500, 750, 1000, 1500] - length: [4194304] diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs index 665ff3ec48..44568614a9 100644 --- a/integration-tests/src/benchmarks/throughput_saturation.rs +++ b/integration-tests/src/benchmarks/throughput_saturation.rs @@ -78,8 +78,10 @@ const IDLE_GAP: Duration = Duration::from_millis(200); /// Total measured wall-clock duration of the sustained-load phase. Throughput /// and churn are measured over this fixed window so steps with different `size` -/// are comparable. -const RUN_DURATION: Duration = Duration::from_secs(30); +/// are comparable. Held long enough that the high-residency plateau persists for +/// at least a minute, so steady-state behaviour at the memory ceiling (not just +/// the initial burst) is observed. 
+const RUN_DURATION: Duration = Duration::from_secs(90); /// Maximum per-agent start stagger, so the fleet is not synchronised: at any /// instant some agents are mid-call (demanding memory) while others sit idle From 7dcb2d3d4d39030b72a074945bd1cbcb2bafa000 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 09:36:17 -0700 Subject: [PATCH 26/60] fix: add empty workspace --- test-components/agent-counters/Cargo.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test-components/agent-counters/Cargo.toml b/test-components/agent-counters/Cargo.toml index c7567da5a5..069f9180f3 100644 --- a/test-components/agent-counters/Cargo.toml +++ b/test-components/agent-counters/Cargo.toml @@ -3,6 +3,12 @@ name = "it_agent_counters" version = "0.0.1" edition = "2024" +# Standalone workspace root: this component is excluded from the golem-oss +# workspace, and when built nested inside another repo's workspace (e.g. the +# cloud-perf CI checkout under golem-cloud) cargo would otherwise walk up and +# attach it to that unrelated workspace. An empty table stops that search. 
+[workspace] + [profile.release] opt-level = "s" lto = true From 139aed5535a36355009627d6a36d125708cf16cb Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 09:59:07 -0700 Subject: [PATCH 27/60] fix: use snake case as method names --- integration-tests/src/benchmarks/throughput_saturation.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs index 44568614a9..768d8c7eb1 100644 --- a/integration-tests/src/benchmarks/throughput_saturation.rs +++ b/integration-tests/src/benchmarks/throughput_saturation.rs @@ -127,9 +127,9 @@ const COUNTERS_VARIANT: SaturationVariant = SaturationVariant { wasm_name: "it_agent_counters_release", component_name: "it:agent-counters", agent_type: "Counter", - active_method: "busy-for", + active_method: "busy_for", active_params: || data_value!(BUSY_MILLIS), - allocate_method: Some("allocate-memory"), + allocate_method: Some("allocate_memory"), }; const ECHO_RUST_VARIANT: SaturationVariant = SaturationVariant { From 442c1c5b0442a00a9d88197db927ab08462c3cfe Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 12:18:22 -0700 Subject: [PATCH 28/60] chore: 300 already saturates, no need for 500 --- .../benchmark_suites/cloud-density-saturation.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index c877993cc6..c6cc81c813 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -41,7 +41,7 @@ benchmarks: - name: throughput-saturation-counters iterations: 1 clusterSize: [2] - size: [50, 100, 150, 200, 300, 500] + size: [50, 100, 150, 200, 300] length: [16777216] # 
Rust echo agents — lean per-instance linear memory (the ~900 KB module is From 4bbb200ebb3bc8a5cb2ad9e680b23fca4dd4c50d Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 14:04:47 -0700 Subject: [PATCH 29/60] fix(worker-executor): avoid deadlock between memory grow and admission eviction scan --- .../active_workers/admission/tests.rs | 163 ++++++++++++++++++ golem-worker-executor/src/worker/mod.rs | 38 ++-- 2 files changed, 191 insertions(+), 10 deletions(-) diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs index 50d545ddb0..8eca9e157c 100644 --- a/golem-worker-executor/src/services/active_workers/admission/tests.rs +++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs @@ -578,3 +578,166 @@ proptest! { }).unwrap(); } } + +/// Concurrent memory grows must not deadlock against the admission eviction +/// scan. +/// +/// A memory grow acquires a permit while the growing worker holds its own +/// instance lock, and the admission slow path scans the worker set, taking each +/// other worker's instance lock to classify it for eviction. With many workers +/// growing at once under memory pressure these two must not form an AB-BA cycle. +/// Workloads that never grow memory never exercise this path. +mod grow_lock_ordering { + use super::super::{AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource}; + use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot}; + use std::sync::Arc; + use std::time::Duration; + use test_r::test; + use tokio::sync::Mutex as AsyncMutex; + + /// Per-worker lock, standing in for `Worker::instance`. + type WorkerLock = Arc>; + + /// Probe pinned to zero admissible headroom so `try_admit` takes the slow + /// (scanning) path, modelling the moment a grow's requested delta does not + /// fit the current headroom. 
+ #[derive(Debug)] + struct SaturatedProbe; + + impl MemoryProbe for SaturatedProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: 1, + current_bytes: u64::MAX, + } + } + } + + /// Probe reporting ample headroom so `try_admit` takes the fast path and + /// never scans — the same grow code path, but not under memory pressure. + #[derive(Debug)] + struct AmpleHeadroomProbe; + + impl MemoryProbe for AmpleHeadroomProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: u64::MAX, + current_bytes: 0, + } + } + } + + /// Eviction source that, like `evict_at_most_memory`, scans every worker and + /// takes each worker's instance lock (via `eviction_class`) to classify it. + /// Frees nothing (all workers active). The lock on each worker is held only + /// briefly, faithfully — the deadlock comes from the ordering, not hold time. + struct ScanningEvictionSource { + workers: Vec, + } + + #[async_trait::async_trait] + impl EvictionSource for ScanningEvictionSource { + async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 { + for worker in &self.workers { + let _guard = worker.lock().await; + } + 0 + } + } + + /// Models the grow path's lock interaction: run the admission scan, which + /// takes other workers' instance locks, without holding this worker's own + /// instance lock, then take it afterwards to merge the permit (as + /// `Worker::increase_memory` does). 
+ async fn grow_then_lock( + controller: &AdmissionController, + own: &WorkerLock, + workers: Vec, + ) { + let source = ScanningEvictionSource { workers }; + controller.try_admit(1, &source).await; + let _own_guard = own.lock().await; + } + + fn workers(n: usize) -> Vec { + (0..n).map(|_| Arc::new(AsyncMutex::new(()))).collect() + } + + fn controller(probe: Box) -> Arc { + Arc::new(AdmissionController::new( + probe, + AdmissionPolicy { + usable_ratio: 1.0, + reserve_bytes: 0, + }, + )) + } + + /// Many workers growing concurrently under memory pressure (every grow takes + /// the scanning slow path) must all complete without deadlocking. + #[test(flavor = "multi_thread", worker_threads = 4)] + async fn concurrent_grows_do_not_deadlock_under_pressure() { + const WORKERS: usize = 32; + const DEADLINE: Duration = Duration::from_secs(10); + + let workers = workers(WORKERS); + let controller = controller(Box::new(SaturatedProbe)); + + let mut grows = Vec::new(); + for i in 0..WORKERS { + let controller = controller.clone(); + let all = workers.clone(); + let own = workers[i].clone(); + grows.push(tokio::spawn(async move { + grow_then_lock(&controller, &own, all).await; + })); + } + + let all_done = async { + for task in grows { + let _ = task.await; + } + }; + + let result = tokio::time::timeout(DEADLINE, all_done).await; + assert!( + result.is_ok(), + "concurrent grows deadlocked: the scan must not run while a worker holds its own instance lock" + ); + } + + /// With comfortable headroom the gate admits on the fast path without + /// scanning, so no worker's instance lock is taken during admission and + /// concurrent grows complete. Confirms the deadlock risk is specific to the + /// scan-under-pressure path. 
+ #[test(flavor = "multi_thread", worker_threads = 4)] + async fn no_deadlock_with_ample_headroom() { + const WORKERS: usize = 32; + const DEADLINE: Duration = Duration::from_secs(10); + + let workers = workers(WORKERS); + let controller = controller(Box::new(AmpleHeadroomProbe)); + + let mut grows = Vec::new(); + for i in 0..WORKERS { + let controller = controller.clone(); + let all = workers.clone(); + let own = workers[i].clone(); + grows.push(tokio::spawn(async move { + grow_then_lock(&controller, &own, all).await; + })); + } + + let all_done = async { + for task in grows { + let _ = task.await; + } + }; + + let result = tokio::time::timeout(DEADLINE, all_done).await; + assert!( + result.is_ok(), + "grows with ample headroom should not scan and should not deadlock" + ); + } +} diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index e9e8dbed8f..efd692f7c4 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -985,19 +985,37 @@ impl Worker { // Should only be called from invocation loop pub async fn increase_memory(&self, delta: u64) -> anyhow::Result<()> { + // The instance lock must not be held while acquiring memory permits: + // permit acquisition runs the admission eviction scan, which takes other + // workers' instance locks. Holding this worker's instance lock across + // that scan while another growing worker does the same is an AB-BA + // deadlock. So acquire the permit without the lock, then re-lock only to + // merge it into the running worker. + match &*self.instance.lock().await { + WorkerInstance::Running(_) => {} + WorkerInstance::Stopping(_) + | WorkerInstance::WaitingForPermit(_) + | WorkerInstance::Unloaded { .. 
} + | WorkerInstance::Deleting => return Ok(()), + } + + let Some(new_permits) = self.active_workers().try_acquire(delta).await else { + return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)); + }; + + // Re-check state under the lock: the worker may have changed state while + // permits were being acquired. If it is no longer running, drop the + // permits (returned to the pool on drop) and treat as a no-op, matching + // the non-running arms above. match &mut *self.instance.lock().await { WorkerInstance::Running(running) => { - if let Some(new_permits) = self.active_workers().try_acquire(delta).await { - running.merge_extra_permits(new_permits); - Ok(()) - } else { - Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)) - } + running.merge_extra_permits(new_permits); + Ok(()) } - WorkerInstance::Stopping(_) => Ok(()), - WorkerInstance::WaitingForPermit(_) => Ok(()), - WorkerInstance::Unloaded { .. } => Ok(()), - WorkerInstance::Deleting => Ok(()), + WorkerInstance::Stopping(_) + | WorkerInstance::WaitingForPermit(_) + | WorkerInstance::Unloaded { .. 
} + | WorkerInstance::Deleting => Ok(()), } } From be19cf460efe2c905cce2cb005eb1f58a22d4c03 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 19:00:01 -0700 Subject: [PATCH 30/60] feat: change order of tests --- .../cloud-density-saturation.yaml | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index c6cc81c813..8522d57d23 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -28,22 +28,6 @@ name: cloud-density-saturation benchmarks: - # Synthetic footprint — each agent retains a deterministic per-agent-distinct - # amount of resident memory, exercising the admission/eviction path with a - # controllable footprint near the limit. Run first: this is the variant that - # actually fills memory and drives the gate to its reject/evict path. - # size = number of active, memory-holding agents (the ramp axis) - # length = base per-agent memory footprint in bytes; each agent retains a - # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => - # ~72 MiB average per agent, filling a ~10 GiB usable pool around - # ~145 agents. The sweep brackets that ceiling and pushes well past it - # so the admission gate's reject/evict behaviour near OOM is exercised. - - name: throughput-saturation-counters - iterations: 1 - clusterSize: [2] - size: [50, 100, 150, 200, 300] - length: [16777216] - # Rust echo agents — lean per-instance linear memory (the ~900 KB module is # charged once per component, shared across all agents; what scales per agent # is the small instance heap). 
The previous run reached the top of the sweep @@ -66,3 +50,19 @@ benchmarks: clusterSize: [2] size: [1000, 2000, 4000, 6000, 8000] length: [0] + + # Synthetic footprint — each agent retains a deterministic per-agent-distinct + # amount of resident memory, exercising the admission/eviction path with a + # controllable footprint near the limit. Runs last. This is the variant that + # actually fills memory and drives the gate to its reject/evict path. + # size = number of active, memory-holding agents (the ramp axis) + # length = base per-agent memory footprint in bytes; each agent retains a + # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => + # ~72 MiB average per agent, filling a ~10 GiB usable pool around + # ~145 agents. The sweep brackets that ceiling and pushes well past it + # so the admission gate's reject/evict behaviour near OOM is exercised. + - name: throughput-saturation-counters + iterations: 1 + clusterSize: [2] + size: [50, 100, 150, 200, 300] + length: [16777216] From 21fd401f29847eb12bb16bbe0a16aa6c59bc447a Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 19:05:06 -0700 Subject: [PATCH 31/60] feat: restore iterations count to 3 --- .../benchmark_suites/cloud-density-saturation.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index 8522d57d23..0b749bc743 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -35,7 +35,7 @@ benchmarks: # eviction-churn rather than memory. Dropped the low points that told us # nothing and pushed the range up with coarser steps. 
- name: throughput-saturation-echo-rust - iterations: 1 + iterations: 3 clusterSize: [2] size: [4000, 8000, 16000, 24000, 32000] length: [0] @@ -46,7 +46,7 @@ benchmarks: # Heavier per agent than the Rust variant, so a lower knee — but the previous # run reached 2000 without saturating, so push higher and drop the low points. - name: throughput-saturation-echo-ts - iterations: 1 + iterations: 3 clusterSize: [2] size: [1000, 2000, 4000, 6000, 8000] length: [0] @@ -62,7 +62,7 @@ benchmarks: # ~145 agents. The sweep brackets that ceiling and pushes well past it # so the admission gate's reject/evict behaviour near OOM is exercised. - name: throughput-saturation-counters - iterations: 1 + iterations: 3 clusterSize: [2] size: [50, 100, 150, 200, 300] length: [16777216] From a9285c064cf09dce1c7281722dece4fcce0c4e0f Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 22:37:08 -0700 Subject: [PATCH 32/60] refactor(worker-executor): make cgroup gate primary, semaphore clamped second line --- .../config/debug-worker-executor.sample.env | 6 +- .../config/debug-worker-executor.toml | 6 +- .../config/worker-executor.sample.env | 9 +- .../config/worker-executor.toml | 9 +- .../services/active_workers/admission/mod.rs | 29 +-- .../active_workers/admission/tests.rs | 208 ++---------------- .../services/active_workers/memory_probe.rs | 21 +- .../src/services/active_workers/mod.rs | 40 +++- .../src/services/golem_config.rs | 63 ++++-- 9 files changed, 137 insertions(+), 254 deletions(-) diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env index 7d95b6f7dc..3c87d1275c 100644 --- a/golem-debugging-service/config/debug-worker-executor.sample.env +++ b/golem-debugging-service/config/debug-worker-executor.sample.env @@ -55,11 +55,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 
GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" -GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 +GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 +GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" @@ -231,11 +232,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" -GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 +GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 +GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml index 316dddd29a..82e5dbdc92 100644 --- a/golem-debugging-service/config/debug-worker-executor.toml +++ b/golem-debugging-service/config/debug-worker-executor.toml @@ -96,10 +96,11 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" -admission_reserve_bytes = 268435456 component_size_coefficient = 2.0 enable_measured_admission = true worker_estimate_coefficient = 1.1 +worker_memory_max_safe_ratio = 0.9 +worker_memory_overcommit_ratio = 1.2 worker_memory_ratio = 0.8 [memory.oom_retry_config] @@ -367,10 +368,11 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" -# 
admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 # enable_measured_admission = true # worker_estimate_coefficient = 1.1 +# worker_memory_max_safe_ratio = 0.9 +# worker_memory_overcommit_ratio = 1.2 # worker_memory_ratio = 0.8 # # [memory.oom_retry_config] diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index bc7bf2c3c0..4cd9a25b12 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env @@ -72,11 +72,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" -GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 +GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 +GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" @@ -294,11 +295,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" -GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 +GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 +GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" @@ -486,11 +488,12 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 
GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" -GOLEM__MEMORY__ADMISSION_RESERVE_BYTES=268435456 GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 +GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 +GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index 819c4fe03d..f5a8cd0183 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ b/golem-worker-executor/config/worker-executor.toml @@ -125,10 +125,11 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" -admission_reserve_bytes = 268435456 component_size_coefficient = 2.0 enable_measured_admission = true worker_estimate_coefficient = 1.1 +worker_memory_max_safe_ratio = 0.9 +worker_memory_overcommit_ratio = 1.2 worker_memory_ratio = 0.8 [memory.oom_retry_config] @@ -459,10 +460,11 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" -# admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 # enable_measured_admission = true # worker_estimate_coefficient = 1.1 +# worker_memory_max_safe_ratio = 0.9 +# worker_memory_overcommit_ratio = 1.2 # worker_memory_ratio = 0.8 # # [memory.oom_retry_config] @@ -763,10 +765,11 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" -# admission_reserve_bytes = 268435456 # component_size_coefficient = 2.0 # enable_measured_admission = true # worker_estimate_coefficient = 1.1 +# worker_memory_max_safe_ratio = 0.9 +# worker_memory_overcommit_ratio = 1.2 # worker_memory_ratio = 0.8 # # [memory.oom_retry_config] diff --git 
a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs index 702dc003e7..d89f710859 100644 --- a/golem-worker-executor/src/services/active_workers/admission/mod.rs +++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs @@ -16,11 +16,13 @@ //! //! Gates worker admission on the executor environment's *real* memory headroom //! read from the [`MemoryProbe`], rather than on the estimate-based semaphore in -//! [`super::ActiveWorkers`]. The two work together: the semaphore is a cheap, -//! high-frequency pre-filter over reserved-but-not-yet-resident intent; this -//! controller is the authoritative check against measured resident usage. When -//! headroom is short it evicts already-resident idle-then-warm work; if it still -//! cannot make room it rejects rather than over-committing. +//! [`super::ActiveWorkers`]. This controller is the primary, authoritative +//! check against measured resident usage and is what refuses admission in normal +//! operation; the estimate semaphore is the second line of defence behind it, +//! its atomic permit acquisition catching the concurrent admissions this +//! (lockless) controller can let through on the same snapshot. When headroom is +//! short it evicts already-resident idle-then-warm work; if it still cannot make +//! room it rejects rather than over-committing. //! //! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`] //! trait so its decision logic can be exercised in isolation with synthetic @@ -67,23 +69,14 @@ pub enum AdmissionDecision { /// Configuration for the headroom-based admission decision. /// -/// Two knobs with distinct jobs: -/// /// * `usable_ratio` — fraction of the measured limit usable for WASM admission. /// The remainder is left for the host (the executor process, allocator /// arenas, runtime buffers). 
Mirrors `worker_memory_ratio`, but applied to the /// measured limit rather than the configured total. -/// -/// * `reserve_bytes` — margin kept free below the carve-out ceiling to absorb -/// the window in which concurrent admissions are observed before becoming -/// resident. Its sufficiency under concurrency is asserted by the property -/// test in `tests.rs`. #[derive(Debug, Clone, Copy)] pub struct AdmissionPolicy { /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission. pub usable_ratio: f64, - /// Dynamic safety margin kept free below the carve-out ceiling. - pub reserve_bytes: u64, } /// Decides admission against measured headroom, evicting resident idle/warm @@ -100,14 +93,12 @@ impl AdmissionController { } /// Bytes available for new admissions: the carve-out ceiling - /// (`usable_ratio × limit`) minus current usage minus the reserve. - /// Saturating throughout — never underflows when already over a ceiling. + /// (`usable_ratio × limit`) minus current usage. Saturating — never + /// underflows when already over the ceiling. 
fn admissible_headroom(&self) -> u64 { let snapshot = self.probe.snapshot(); let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64; - ceiling - .saturating_sub(snapshot.current_bytes) - .saturating_sub(self.policy.reserve_bytes) + ceiling.saturating_sub(snapshot.current_bytes) } /// Decide whether `request_bytes` can be admitted, evicting from `source` if diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs index 8eca9e157c..4996b97be7 100644 --- a/golem-worker-executor/src/services/active_workers/admission/tests.rs +++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs @@ -109,23 +109,16 @@ impl EvictionSource for FakeEvictionSource { } } -fn controller(state: Arc>, reserve_bytes: u64) -> AdmissionController { - controller_with_ratio(state, 1.0, reserve_bytes) +fn controller(state: Arc>) -> AdmissionController { + controller_with_ratio(state, 1.0) } -fn controller_with_ratio( - state: Arc>, - usable_ratio: f64, - reserve_bytes: u64, -) -> AdmissionController { +fn controller_with_ratio(state: Arc>, usable_ratio: f64) -> AdmissionController { AdmissionController::new( Box::new(FakeProbe { state: state.clone(), }), - AdmissionPolicy { - usable_ratio, - reserve_bytes, - }, + AdmissionPolicy { usable_ratio }, ) } @@ -156,7 +149,7 @@ async fn admits_when_headroom_is_ample_without_evicting() { }], ..Default::default() })); - let ctrl = controller(state.clone(), 0); + let ctrl = controller(state.clone()); let source = FakeEvictionSource { state: state.clone(), }; @@ -186,7 +179,7 @@ async fn evicts_idle_before_warm() { })); // usage = 800, limit = 1000, headroom = 200. Request 300 → shortfall 100. // One idle (400) covers it; warm must remain untouched. 
- let ctrl = controller(state.clone(), 0); + let ctrl = controller(state.clone()); let source = FakeEvictionSource { state: state.clone(), }; @@ -208,7 +201,7 @@ async fn rejects_when_nothing_can_be_freed() { residents: vec![], ..Default::default() })); - let ctrl = controller(state.clone(), 0); + let ctrl = controller(state.clone()); let source = FakeEvictionSource { state: state.clone(), }; @@ -219,31 +212,6 @@ async fn rejects_when_nothing_can_be_freed() { assert_eq!(state.lock().unwrap().usage(), 950); } -#[test] -async fn reserve_is_kept_free() { - let state = Arc::new(Mutex::new(EnvState { - limit: 1000, - pinned_usage: 700, - residents: vec![], - ..Default::default() - })); - // headroom = 300, reserve = 200 → admissible = 100. Request 150 → reject. - let ctrl = controller(state.clone(), 200); - let source = FakeEvictionSource { - state: state.clone(), - }; - - assert_eq!( - apply_admit(&ctrl, &source, &state, 150).await, - AdmissionDecision::Reject - ); - // But a request within the admissible window succeeds. - assert_eq!( - apply_admit(&ctrl, &source, &state, 100).await, - AdmissionDecision::Admit - ); -} - // ── Property tests ─────────────────────────────────────────────────────────── #[derive(Debug, Clone)] @@ -295,12 +263,11 @@ fn arb_fitting_state( proptest! { /// Safety invariant: across any random sequence of admits — with random - /// pre-resident work, random sizes, and a random reserve — modeled usage - /// must never exceed the limit. This is the property that rules out OOM. + /// pre-resident work and random sizes — modeled usage must never exceed the + /// limit. This is the property that rules out OOM. #[test] fn usage_never_exceeds_limit( (limit, residents) in arb_fitting_state(500..5000, 20), - reserve in 0u64..300, ops in arb_ops(), ) { let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); @@ -311,7 +278,7 @@ proptest! 
{ residents, ..Default::default() })); - let ctrl = controller(state.clone(), reserve); + let ctrl = controller(state.clone()); let source = FakeEvictionSource { state: state.clone() }; for op in ops { @@ -350,7 +317,7 @@ proptest! { residents, ..Default::default() })); - let ctrl = controller(state.clone(), 0); + let ctrl = controller(state.clone()); let source = FakeEvictionSource { state: state.clone() }; for op in ops { @@ -381,7 +348,7 @@ proptest! { residents, ..Default::default() })); - let ctrl = controller(state.clone(), 0); + let ctrl = controller(state.clone()); let source = FakeEvictionSource { state: state.clone() }; for op in ops { @@ -420,7 +387,7 @@ async fn usable_ratio_caps_admission_below_full_limit() { })); // ceiling = 0.8 * 1000 = 800. Request 850 must be rejected even though the // raw limit (1000) would allow it — the top 20% is reserved for the host. - let ctrl = controller_with_ratio(state.clone(), 0.8, 0); + let ctrl = controller_with_ratio(state.clone(), 0.8); let source = FakeEvictionSource { state: state.clone(), }; @@ -435,150 +402,6 @@ async fn usable_ratio_caps_admission_below_full_limit() { ); } -// ── Concurrency ────────────────────────────────────────────────────────────── -// -// In production, each admission reads headroom (`try_admit`) and then separately -// commits to the upstream atomic permit (modeled here as `pinned_usage += -// request`). The two steps are not serialised across concurrent admissions, so -// several admissions can read the same pre-commit snapshot, all pass the check, -// and all commit. The `reserve` margin accounts for this instead of a lock: -// concurrent admissions may push usage above the carve-out ceiling into the -// reserve, but must not push it above the true `limit`. -// -// These tests force the maximum-overlap case with a barrier: every admission -// completes its headroom check before any admission commits. 
This makes the -// maximum overshoot deterministic rather than dependent on task scheduling, so -// an undersized reserve is reliably detected and a correctly sized one is -// actually exercised. - -/// Run `racers` admissions of `request` bytes against a fresh environment with -/// the given `reserve`, forcing all headroom checks to complete before any -/// commit (maximum overlap). Returns the final environment usage and the number -/// of admits granted. -async fn race_admissions_worst_case( - limit: u64, - initial_pinned: u64, - reserve: u64, - racers: usize, - request: u64, -) -> (u64, usize) { - let state = Arc::new(Mutex::new(EnvState { - limit, - pinned_usage: initial_pinned, - residents: vec![], - ..Default::default() - })); - let ctrl = Arc::new(controller_with_ratio(state.clone(), 1.0, reserve)); - // All racers check before any commits: the maximum-overlap schedule. - let barrier = Arc::new(tokio::sync::Barrier::new(racers)); - - let mut handles = Vec::new(); - for _ in 0..racers { - let ctrl = ctrl.clone(); - let state = state.clone(); - let barrier = barrier.clone(); - handles.push(tokio::spawn(async move { - let source = FakeEvictionSource { - state: state.clone(), - }; - let decision = ctrl.try_admit(request, &source).await; - // Hold every racer here until all have decided against the same - // pre-commit snapshot, then let the commits run together. - barrier.wait().await; - if decision == AdmissionDecision::Admit { - state.lock().unwrap().pinned_usage += request; - true - } else { - false - } - })); - } - let mut admitted = 0; - for h in handles { - if h.await.unwrap() { - admitted += 1; - } - } - let usage = state.lock().unwrap().usage(); - (usage, admitted) -} - -proptest! { - /// A reserve sized for the maximum concurrent overshoot keeps real usage - /// under the limit even when every racer checks before any commits, with a - /// non-trivial near-ceiling pinned base. 
- /// - /// Sizing: at most all `racers` can pass against the same pre-commit - /// snapshot, so the reserve must cover `racers × request` landing in the - /// window between check and commit. With that margin, usage stays - /// `<= limit`. - #[test] - fn sufficient_reserve_holds_under_worst_case_overlap( - racers in 2usize..16, - request in 50u64..400, - base_fill in 0u64..2000, - ) { - let reserve = request * racers as u64; - // Limit leaves room for the pre-existing fill, the reserve, and at least - // one request's worth of admissible headroom above the reserve. - let limit = base_fill + reserve + request + 500; - - let rt = tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .build() - .unwrap(); - rt.block_on(async move { - let (usage, _) = - race_admissions_worst_case(limit, base_fill, reserve, racers, request).await; - prop_assert!( - usage <= limit, - "maximum overlap drove usage {usage} past limit {limit}" - ); - Ok(()) - }).unwrap(); - } - - /// With no reserve and maximum overlap forced, several racers admitting at - /// once must push usage above the carve-out ceiling. This confirms the race - /// the design tolerates is real and this harness reproduces it; without it, - /// the safety test above could pass without ever exercising a concurrent - /// overshoot. Usage may still stay under `limit`; the assertion is on the - /// overshoot past the ceiling. - #[test] - fn worst_case_overlap_overshoots_ceiling_without_reserve( - racers in 2usize..12, - request in 50u64..400, - ) { - // Ceiling headroom sized for exactly one request; no reserve cushion. - let ceiling = request; - let limit = request * racers as u64 + 1000; - - let rt = tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .build() - .unwrap(); - rt.block_on(async move { - // pinned = limit - ceiling so admissible headroom is exactly one - // request; with reserve 0, every racer sees room for itself. 
- let pinned = limit - ceiling; - let (usage, admitted) = - race_admissions_worst_case(limit, pinned, 0, racers, request).await; - // More than one admit means the gate let concurrent racers through - // on the same snapshot. - prop_assert!( - admitted >= 2, - "expected concurrent over-admission with no reserve, got {admitted} admits" - ); - prop_assert!( - usage > ceiling + pinned, - "usage {usage} did not overshoot the ceiling {}", - ceiling + pinned - ); - Ok(()) - }).unwrap(); - } -} - /// Concurrent memory grows must not deadlock against the admission eviction /// scan. /// @@ -666,10 +489,7 @@ mod grow_lock_ordering { fn controller(probe: Box) -> Arc { Arc::new(AdmissionController::new( probe, - AdmissionPolicy { - usable_ratio: 1.0, - reserve_bytes: 0, - }, + AdmissionPolicy { usable_ratio: 1.0 }, )) } diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs index 346a3dd363..6a26b3dd25 100644 --- a/golem-worker-executor/src/services/active_workers/memory_probe.rs +++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs @@ -47,9 +47,10 @@ impl MemorySnapshot { } } -/// Reads the executor environment's real memory state. Cheap enough to sample -/// at admission time, but not on every wasmtime `memory.grow` (that is what the -/// estimate-semaphore pre-check absorbs). +/// Reads the executor environment's real memory state. Sampled at every +/// admission attempt, including each wasmtime `memory.grow`, so it must be +/// cheap: the cgroup v2 backend is two small file reads independent of the +/// number of resident workers. pub trait MemoryProbe: Send + Sync + Debug { fn snapshot(&self) -> MemorySnapshot; @@ -172,6 +173,10 @@ impl MemoryProbe for CgroupV2Probe { /// (falling back to host RAM / process RSS otherwise). 
pub fn default_probe(memory_override: Option) -> Box { if let Some(limit) = memory_override { + tracing::info!( + limit_bytes = limit, + "Memory probe: ProcessRssProbe (limit pinned by system_memory_override)" + ); return Box::new(ProcessRssProbe::new(limit)); } @@ -184,8 +189,18 @@ pub fn default_probe(memory_override: Option) -> Box { #[cfg(target_os = "linux")] { if let Some(probe) = CgroupV2Probe::try_new(host_ram) { + let snapshot = probe.snapshot(); + tracing::info!( + limit_bytes = snapshot.limit_bytes, + current_bytes = snapshot.current_bytes, + "Memory probe: CgroupV2Probe (cgroup memory.max/current)" + ); return Box::new(probe); } } + tracing::info!( + limit_bytes = host_ram, + "Memory probe: ProcessRssProbe (host RAM, no cgroup v2 limit)" + ); Box::new(ProcessRssProbe::new(host_ram)) } diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 7e95da1703..0b8e02fa38 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -80,10 +80,12 @@ pub struct ActiveWorkers { acquire_retry_delay: Duration, /// Authoritative measured-headroom admission gate. Decides whether real /// memory headroom permits a new acquisition, evicting via the worker set - /// when short. The estimate-based `worker_memory` semaphore is the cheap - /// pre-filter and atomic commit in front of it. `None` when measured - /// admission is disabled (e.g. shared test environments) — admission then - /// relies on the estimate semaphore alone. + /// when short, and is what refuses admission in normal operation. The + /// estimate-based `worker_memory` semaphore is the second line of defence + /// behind it: its atomic permit acquisition catches the concurrent + /// admissions the lockless gate can let through on the same snapshot. `None` + /// when measured admission is disabled (e.g. 
shared test environments) — + /// admission then relies on the estimate semaphore alone. admission: Option, /// Charges each resident component's compiled module size to the estimate /// pool exactly once (shared across all its workers) rather than per worker. @@ -138,13 +140,14 @@ impl Drop for WorkerMemoryPermit { impl ActiveWorkers { pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self { - let worker_memory_size = memory_config.worker_memory(); - let admission = memory_config.enable_measured_admission.then(|| { - AdmissionController::new( - default_probe(memory_config.system_memory_override), - memory_config.admission_policy(), - ) - }); + // Build the probe once and size both admission layers from its reported + // limit, so the estimate semaphore and the measured-headroom gate share + // a single basis (the pod's cgroup limit when constrained, not host RAM). + let probe = default_probe(memory_config.system_memory_override); + let worker_memory_size = memory_config.worker_memory_for_limit(probe.limit_bytes()); + let admission = memory_config + .enable_measured_admission + .then(|| AdmissionController::new(probe, memory_config.admission_policy())); let workers = Cache::new( None, FullCacheEvictionMode::None, @@ -273,6 +276,15 @@ impl ActiveWorkers { .expect("requested memory size is too large"); loop { + // Blocking acquire: retry until the request can be admitted. A + // rejection here is transient, not terminal. The gate reads resident + // memory from the probe, which lags real usage (cgroup + // `memory.current` only counts already-touched pages), so a worker + // admitted earlier may not yet be fully resident; pressure eases as + // its pages settle and as other workers finish and release pool + // permits. Each iteration backs off, re-reads the gate, and re-tries + // the pool, so the caller eventually proceeds once headroom recovers + // rather than failing under momentary pressure. 
// Authoritative measured-headroom gate (when enabled). Evicts // idle-then-warm when real headroom is short; rejects (and we back // off) when it cannot make room rather than risking the limit. @@ -285,7 +297,11 @@ impl ActiveWorkers { continue; } - // Estimate-semaphore pool: cheap pre-check + atomic commit. + // Estimate-semaphore pool: the second line of defence behind the + // gate. Its atomic permit acquisition catches the concurrent + // admissions the lockless gate can let through on the same snapshot. + // Sized above the gate ceiling (but clamped below the limit), so it + // rarely binds first — the gate refuses in normal operation. if let Some(permit) = acquire_pool_permit( &self.worker_memory, &self.workers, diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 5a95e0056f..9a53176160 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -966,10 +966,21 @@ pub struct MemoryConfig { /// Multiplier applied to a component's `component_size`, charged once per /// resident component (shared across all its workers) rather than per worker. pub component_size_coefficient: f64, - /// Bytes of measured headroom kept free below the usable ceiling as a margin - /// against concurrent admissions overshooting before becoming resident. Used - /// by the measured-headroom admission gate. - pub admission_reserve_bytes: u64, + /// Multiplier (typically > 1.0) applied to the measured limit when sizing the + /// estimate semaphore. The estimate per worker is normally larger than its + /// real resident usage, so the semaphore is allowed to authorize more + /// estimated bytes than the limit: it is the second line of defence behind + /// the measured-headroom gate, catching the concurrent-admission race the + /// (lockless) gate cannot, while the gate refuses first in normal operation + /// against real usage. 
Always clamped by `worker_memory_max_safe_ratio` so it + /// can never itself authorise real usage past a safe fraction of the limit. + pub worker_memory_overcommit_ratio: f64, + /// Hard upper bound (fraction of the measured limit, < 1.0) on the estimate + /// semaphore size, regardless of `worker_memory_overcommit_ratio`. Keeps the + /// semaphore below the true limit so headroom always remains for the wasmtime + /// host even if the semaphore is the binding guard and estimates happen to + /// match real usage. + pub worker_memory_max_safe_ratio: f64, /// Whether the measured-headroom admission gate is active. Requires the /// executor to own its memory environment (its own cgroup/process), as in a /// production pod. Disable in shared environments — such as the in-process @@ -983,12 +994,14 @@ pub struct MemoryConfig { } impl MemoryConfig { + /// The memory limit this executor must stay under, resolved through the same + /// probe the admission gate uses: the cgroup `memory.max` of the pod on a + /// constrained Linux deployment, the configured override when set, and host + /// RAM only when the process is genuinely unconstrained. In a container this + /// is the pod's ceiling, not the host's total RAM. pub fn total_system_memory(&self) -> u64 { - self.system_memory_override.unwrap_or_else(|| { - let mut sysinfo = sysinfo::System::new(); - sysinfo.refresh_memory(); - sysinfo.total_memory() - }) + crate::services::active_workers::memory_probe::default_probe(self.system_memory_override) + .limit_bytes() } pub fn system_memory(&self) -> u64 { @@ -997,18 +1010,30 @@ impl MemoryConfig { sysinfo.available_memory() } + /// Size of the estimate semaphore: the measured limit scaled by the + /// overcommit ratio, then clamped to `worker_memory_max_safe_ratio` of the + /// limit. 
The overcommit lets the semaphore sit slightly above the gate + /// ceiling as a second line of defence (per-worker estimates exceed real + /// usage, so it rarely binds first); the clamp guarantees it can never be + /// sized to authorise real usage past a safe fraction of the limit, leaving + /// headroom for the wasmtime host. + pub fn worker_memory_for_limit(&self, limit_bytes: u64) -> usize { + let limit = limit_bytes as f64; + let overcommit = limit * self.worker_memory_overcommit_ratio; + let safe_cap = limit * self.worker_memory_max_safe_ratio; + overcommit.min(safe_cap) as usize + } + pub fn worker_memory(&self) -> usize { - (self.total_system_memory() as f64 * self.worker_memory_ratio) as usize + self.worker_memory_for_limit(self.total_system_memory()) } /// The admission policy for the measured-headroom gate. Reuses /// `worker_memory_ratio` as the usable fraction of the measured limit (the - /// host keeps the remainder) and `admission_reserve_bytes` as the concurrent - /// overshoot margin. + /// host keeps the remainder). 
pub fn admission_policy(&self) -> crate::services::active_workers::admission::AdmissionPolicy { crate::services::active_workers::admission::AdmissionPolicy { usable_ratio: self.worker_memory_ratio, - reserve_bytes: self.admission_reserve_bytes, } } } @@ -1036,8 +1061,13 @@ impl SafeDisplay for MemoryConfig { ); let _ = writeln!( &mut result, - "admission reserve bytes: {}", - self.admission_reserve_bytes + "worker memory overcommit ratio: {}", + self.worker_memory_overcommit_ratio + ); + let _ = writeln!( + &mut result, + "worker memory max safe ratio: {}", + self.worker_memory_max_safe_ratio ); let _ = writeln!( &mut result, @@ -1569,7 +1599,8 @@ impl Default for MemoryConfig { worker_memory_ratio: 0.8, worker_estimate_coefficient: 1.1, component_size_coefficient: 2.0, - admission_reserve_bytes: 256 * 1024 * 1024, + worker_memory_overcommit_ratio: 1.2, + worker_memory_max_safe_ratio: 0.9, enable_measured_admission: true, acquire_retry_delay: Duration::from_millis(500), oom_retry_config: RetryConfig { From 27119b28eec1e0bd3db73438edcb6edba021ff5d Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Tue, 9 Jun 2026 23:12:01 -0700 Subject: [PATCH 33/60] feat: run only initial echo test to make sure we did not make it slower --- .../benchmark_suites/cloud-perf.yaml | 172 +++++++++--------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index d508c5b5fc..21ef48352a 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -31,100 +31,100 @@ benchmarks: - name: throughput-echo iterations: 3 clusterSize: [2] - size: [1, 50, 100, 250] + size: [1, 10, 50, 100, 250] length: [1000] - # size = number of workers per implementation - # length = payload size in bytes sent to large_input - # NOTE: large payloads grow worker linear memory, so this is the 
throughput - # benchmark most relevant to the memory-admission investigation — sized to - # match throughput-echo so it exercises real density. - - name: throughput-large-input - iterations: 3 - clusterSize: [2] - size: [1, 50, 100, 250] - length: [100, 10000] + # # size = number of workers per implementation + # # length = payload size in bytes sent to large_input + # # NOTE: large payloads grow worker linear memory, so this is the throughput + # # benchmark most relevant to the memory-admission investigation — sized to + # # match throughput-echo so it exercises real density. + # - name: throughput-large-input + # iterations: 3 + # clusterSize: [2] + # size: [1, 50, 100, 250] + # length: [100, 10000] - # size = number of workers per implementation - # length = CPU work length passed to cpu_intensive - - name: throughput-cpu-intensive - iterations: 3 - clusterSize: [2] - size: [1, 50, 100, 250] - length: [100] + # # size = number of workers per implementation + # # length = CPU work length passed to cpu_intensive + # - name: throughput-cpu-intensive + # iterations: 3 + # clusterSize: [2] + # size: [1, 50, 100, 250] + # length: [100] - # Cold-start: compilation cache disabled — measures true cold-start latency - # with no warm compiled artefact available. - # size = number of unique components created (each in its own env) - # length = seconds to wait per component for pre-compilation warm-up - - name: cold-start-unknown-small - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 25, 50] - length: [2] - disableCompilationCache: true + # # Cold-start: compilation cache disabled — measures true cold-start latency + # # with no warm compiled artefact available. 
+ # # size = number of unique components created (each in its own env) + # # length = seconds to wait per component for pre-compilation warm-up + # - name: cold-start-unknown-small + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 25, 50] + # length: [2] + # disableCompilationCache: true - - name: cold-start-unknown-medium - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 25, 50] - length: [5] - disableCompilationCache: true + # - name: cold-start-unknown-medium + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 25, 50] + # length: [5] + # disableCompilationCache: true - # Cold-start: compilation cache enabled — measures latency once the compiled - # artefact is available in the cache. - # size = number of unique components created (each in its own env) - # length = seconds to wait per component for pre-compilation warm-up - # NOTE: if results here are close to the cache-disabled entries above, the - # warm-up wait is too short and compilation hasn't finished — bump length. - - name: cold-start-unknown-small - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 25, 50] - length: [2] + # # Cold-start: compilation cache enabled — measures latency once the compiled + # # artefact is available in the cache. + # # size = number of unique components created (each in its own env) + # # length = seconds to wait per component for pre-compilation warm-up + # # NOTE: if results here are close to the cache-disabled entries above, the + # # warm-up wait is too short and compilation hasn't finished — bump length. + # - name: cold-start-unknown-small + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 25, 50] + # length: [2] - - name: cold-start-unknown-medium - iterations: 3 - clusterSize: [2] - size: [1, 5, 10, 25, 50] - length: [5] + # - name: cold-start-unknown-medium + # iterations: 3 + # clusterSize: [2] + # size: [1, 5, 10, 25, 50] + # length: [5] - # Invocation latency — hot and cold paths through the Gateway NLB. 
- # Large worker counts to stress the load balancer and connection pool. - # size = number of workers created - # length = number of hot invocations per worker after the first cold one - - name: latency-small - iterations: 3 - clusterSize: [2] - size: [100, 500, 1000, 2000, 5000] - length: [2] + # # Invocation latency — hot and cold paths through the Gateway NLB. + # # Large worker counts to stress the load balancer and connection pool. + # # size = number of workers created + # # length = number of hot invocations per worker after the first cold one + # - name: latency-small + # iterations: 3 + # clusterSize: [2] + # size: [100, 500, 1000, 2000, 5000] + # length: [2] - - name: latency-medium - iterations: 3 - clusterSize: [2] - size: [100, 500, 1000, 2000] - length: [5] + # - name: latency-medium + # iterations: 3 + # clusterSize: [2] + # size: [100, 500, 1000, 2000] + # length: [5] - # Sleep — measures worker suspension and resumption under real network - # conditions. High residency: all `size` workers held in memory sleeping at - # once, so this also probes how many resident workers fit (memory-admission - # relevant) — pushed past the ~2000 echo proved out. - # size = number of workers launched in parallel - # length = sleep duration in milliseconds - - name: sleep - iterations: 3 - clusterSize: [2] - size: [10, 100, 500, 1000, 2000] - length: [10000] + # # Sleep — measures worker suspension and resumption under real network + # # conditions. High residency: all `size` workers held in memory sleeping at + # # once, so this also probes how many resident workers fit (memory-admission + # # relevant) — pushed past the ~2000 echo proved out. 
+ # # size = number of workers launched in parallel + # # length = sleep duration in milliseconds + # - name: sleep + # iterations: 3 + # clusterSize: [2] + # size: [10, 100, 500, 1000, 2000] + # length: [10000] - # Durability overhead — measures the cost of durable vs ephemeral execution - # across four variants (durable-persistent, durable-non-persistent, - # ephemeral, durable-persistent-commit). size workers concurrent per phase; - # sized up to put real load on the oplog/persistence/storage path. - # size = number of workers per variant - # length = loop iteration count passed to oplog_heavy - - name: durability-overhead - iterations: 3 - clusterSize: [2] - size: [10, 50, 100, 250] - length: [5000] + # # Durability overhead — measures the cost of durable vs ephemeral execution + # # across four variants (durable-persistent, durable-non-persistent, + # # ephemeral, durable-persistent-commit). size workers concurrent per phase; + # # sized up to put real load on the oplog/persistence/storage path. 
+ # # size = number of workers per variant + # # length = loop iteration count passed to oplog_heavy + # - name: durability-overhead + # iterations: 3 + # clusterSize: [2] + # size: [10, 50, 100, 250] + # length: [5000] From b608593bad07c1049b645b6e0414de77fc618e3a Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 01:48:34 -0700 Subject: [PATCH 34/60] feat: run only saturation test --- .../cloud-density-saturation.yaml | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index 0b749bc743..3c2f6a6b1f 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -28,28 +28,28 @@ name: cloud-density-saturation benchmarks: - # Rust echo agents — lean per-instance linear memory (the ~900 KB module is - # charged once per component, shared across all agents; what scales per agent - # is the small instance heap). The previous run reached the top of the sweep - # (12000) without saturating pod memory, so the knee here is throughput / - # eviction-churn rather than memory. Dropped the low points that told us - # nothing and pushed the range up with coarser steps. - - name: throughput-saturation-echo-rust - iterations: 3 - clusterSize: [2] - size: [4000, 8000, 16000, 24000, 32000] - length: [0] + # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is + # # charged once per component, shared across all agents; what scales per agent + # # is the small instance heap). The previous run reached the top of the sweep + # # (12000) without saturating pod memory, so the knee here is throughput / + # # eviction-churn rather than memory. Dropped the low points that told us + # # nothing and pushed the range up with coarser steps. 
+ # - name: throughput-saturation-echo-rust + # iterations: 3 + # clusterSize: [2] + # size: [4000, 8000, 16000, 24000, 32000] + # length: [0] - # TypeScript echo agents — each instance instantiates its own QuickJS runtime - # and JS heap in its own linear memory (the 17.4 MB module is shared once per - # component; the per-instance runtime state is the heavy per-agent cost). - # Heavier per agent than the Rust variant, so a lower knee — but the previous - # run reached 2000 without saturating, so push higher and drop the low points. - - name: throughput-saturation-echo-ts - iterations: 3 - clusterSize: [2] - size: [1000, 2000, 4000, 6000, 8000] - length: [0] + # # TypeScript echo agents — each instance instantiates its own QuickJS runtime + # # and JS heap in its own linear memory (the 17.4 MB module is shared once per + # # component; the per-instance runtime state is the heavy per-agent cost). + # # Heavier per agent than the Rust variant, so a lower knee — but the previous + # # run reached 2000 without saturating, so push higher and drop the low points. + # - name: throughput-saturation-echo-ts + # iterations: 3 + # clusterSize: [2] + # size: [1000, 2000, 4000, 6000, 8000] + # length: [0] # Synthetic footprint — each agent retains a deterministic per-agent-distinct # amount of resident memory, exercising the admission/eviction path with a @@ -62,7 +62,7 @@ benchmarks: # ~145 agents. The sweep brackets that ceiling and pushes well past it # so the admission gate's reject/evict behaviour near OOM is exercised. 
- name: throughput-saturation-counters - iterations: 3 + iterations: 1 clusterSize: [2] size: [50, 100, 150, 200, 300] length: [16777216] From 1f1b77a4a36db34b1e27becf185678208236316f Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 02:22:28 -0700 Subject: [PATCH 35/60] feat: bigger saturation spread --- .../cloud-density-saturation.yaml | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index 3c2f6a6b1f..bd9819524e 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -28,41 +28,41 @@ name: cloud-density-saturation benchmarks: - # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is - # # charged once per component, shared across all agents; what scales per agent - # # is the small instance heap). The previous run reached the top of the sweep - # # (12000) without saturating pod memory, so the knee here is throughput / - # # eviction-churn rather than memory. Dropped the low points that told us - # # nothing and pushed the range up with coarser steps. - # - name: throughput-saturation-echo-rust - # iterations: 3 - # clusterSize: [2] - # size: [4000, 8000, 16000, 24000, 32000] - # length: [0] + # Rust echo agents — lean per-instance linear memory (the ~900 KB module is + # charged once per component, shared across all agents; what scales per agent + # is the small instance heap). The previous run reached the top of the sweep + # (12000) without saturating pod memory, so the knee here is throughput / + # eviction-churn rather than memory. Dropped the low points that told us + # nothing and pushed the range up with coarser steps. 
+ - name: throughput-saturation-echo-rust + iterations: 3 + clusterSize: [2] + size: [2000, 3000, 4000, 8000, 16000, 20000] + length: [0] - # # TypeScript echo agents — each instance instantiates its own QuickJS runtime - # # and JS heap in its own linear memory (the 17.4 MB module is shared once per - # # component; the per-instance runtime state is the heavy per-agent cost). - # # Heavier per agent than the Rust variant, so a lower knee — but the previous - # # run reached 2000 without saturating, so push higher and drop the low points. - # - name: throughput-saturation-echo-ts + # TypeScript echo agents — each instance instantiates its own QuickJS runtime + # and JS heap in its own linear memory (the 17.4 MB module is shared once per + # component; the per-instance runtime state is the heavy per-agent cost). + # Heavier per agent than the Rust variant, so a lower knee — but the previous + # run reached 2000 without saturating, so push higher and drop the low points. + - name: throughput-saturation-echo-ts + iterations: 3 + clusterSize: [2] + size: [1000, 2000, 4000] + length: [0] + + # # Synthetic footprint — each agent retains a deterministic per-agent-distinct + # # amount of resident memory, exercising the admission/eviction path with a + # # controllable footprint near the limit. Run first: this is the variant that + # # actually fills memory and drives the gate to its reject/evict path. + # # size = number of active, memory-holding agents (the ramp axis) + # # length = base per-agent memory footprint in bytes; each agent retains a + # # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => + # # ~72 MiB average per agent, filling a ~10 GiB usable pool around + # # ~145 agents. The sweep brackets that ceiling and pushes well past it + # # so the admission gate's reject/evict behaviour near OOM is exercised. 
+ # - name: throughput-saturation-counters # iterations: 3 # clusterSize: [2] - # size: [1000, 2000, 4000, 6000, 8000] - # length: [0] - - # Synthetic footprint — each agent retains a deterministic per-agent-distinct - # amount of resident memory, exercising the admission/eviction path with a - # controllable footprint near the limit. Run first: this is the variant that - # actually fills memory and drives the gate to its reject/evict path. - # size = number of active, memory-holding agents (the ramp axis) - # length = base per-agent memory footprint in bytes; each agent retains a - # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => - # ~72 MiB average per agent, filling a ~10 GiB usable pool around - # ~145 agents. The sweep brackets that ceiling and pushes well past it - # so the admission gate's reject/evict behaviour near OOM is exercised. - - name: throughput-saturation-counters - iterations: 1 - clusterSize: [2] - size: [50, 100, 150, 200, 300] - length: [16777216] + # size: [50, 100, 150, 200, 300] + # length: [16777216] From 1bd27ea6fb36f5ba0752c6cf893d49b32b356a4c Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 02:24:19 -0700 Subject: [PATCH 36/60] feat(benchmark): change the steps --- .../benchmark_suites/cloud-density-saturation.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index bd9819524e..78b0064fa2 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -37,7 +37,7 @@ benchmarks: - name: throughput-saturation-echo-rust iterations: 3 clusterSize: [2] - size: [2000, 3000, 4000, 8000, 16000, 20000] + size: [2000, 3000, 4000, 5000, 10000, 15000, 20000] length: [0] # TypeScript echo agents — each instance instantiates its own 
QuickJS runtime @@ -48,7 +48,7 @@ benchmarks: - name: throughput-saturation-echo-ts iterations: 3 clusterSize: [2] - size: [1000, 2000, 4000] + size: [1000, 2000, 3000] length: [0] # # Synthetic footprint — each agent retains a deterministic per-agent-distinct From 898435df30497942fc0b0875a3ae9d09aa3d05b5 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:14:33 -0700 Subject: [PATCH 37/60] feat: replace estimate-semaphore completely with measured-headroom admission controller --- .../config/debug-worker-executor.sample.env | 4 - .../config/debug-worker-executor.toml | 4 - golem-debugging-service/src/lib.rs | 3 +- .../config/worker-executor.sample.env | 6 - .../config/worker-executor.toml | 6 - .../active_workers/admission/tests.txt | 9 + golem-worker-executor/src/lib.rs | 3 +- golem-worker-executor/src/metrics.rs | 86 ++-- .../services/active_workers/admission/mod.rs | 115 ++++- .../active_workers/admission/tests.rs | 440 +++++++++++++++++- .../src/services/active_workers/mod.rs | 354 +++++--------- .../src/services/active_workers/tests.rs | 111 +++++ .../src/services/golem_config.rs | 53 +-- golem-worker-executor/src/worker/mod.rs | 94 ++-- 14 files changed, 843 insertions(+), 445 deletions(-) create mode 100644 golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env index 3c87d1275c..4349e54ebe 100644 --- a/golem-debugging-service/config/debug-worker-executor.sample.env +++ b/golem-debugging-service/config/debug-worker-executor.sample.env @@ -59,8 +59,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 -GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 -GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 
GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" @@ -236,8 +234,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 -GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 -GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml index 82e5dbdc92..01a81fd83a 100644 --- a/golem-debugging-service/config/debug-worker-executor.toml +++ b/golem-debugging-service/config/debug-worker-executor.toml @@ -99,8 +99,6 @@ acquire_retry_delay = "500ms" component_size_coefficient = 2.0 enable_measured_admission = true worker_estimate_coefficient = 1.1 -worker_memory_max_safe_ratio = 0.9 -worker_memory_overcommit_ratio = 1.2 worker_memory_ratio = 0.8 [memory.oom_retry_config] @@ -371,8 +369,6 @@ without_time = false # component_size_coefficient = 2.0 # enable_measured_admission = true # worker_estimate_coefficient = 1.1 -# worker_memory_max_safe_ratio = 0.9 -# worker_memory_overcommit_ratio = 1.2 # worker_memory_ratio = 0.8 # # [memory.oom_retry_config] diff --git a/golem-debugging-service/src/lib.rs b/golem-debugging-service/src/lib.rs index d6062f2cf1..71e3aeac9c 100644 --- a/golem-debugging-service/src/lib.rs +++ b/golem-debugging-service/src/lib.rs @@ -377,7 +377,8 @@ pub async fn run_debug_worker_executor + ?Sized + Sen let total_system_memory = golem_config.memory.total_system_memory(); let system_memory = golem_config.memory.system_memory(); - let worker_memory = golem_config.memory.worker_memory(); + let worker_memory = + (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as 
u64; info!( "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}", ISizeFormatter::new(total_system_memory, humansize::BINARY), diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index 4cd9a25b12..d3c7a04559 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env @@ -76,8 +76,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 -GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 -GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" @@ -299,8 +297,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 -GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 -GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" @@ -492,8 +488,6 @@ GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 -GOLEM__MEMORY__WORKER_MEMORY_MAX_SAFE_RATIO=0.9 -GOLEM__MEMORY__WORKER_MEMORY_OVERCOMMIT_RATIO=1.2 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_ATTEMPTS=4294967295 GOLEM__MEMORY__OOM_RETRY_CONFIG__MAX_DELAY="5s" diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index f5a8cd0183..e77c5f9bfa 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ 
b/golem-worker-executor/config/worker-executor.toml @@ -128,8 +128,6 @@ acquire_retry_delay = "500ms" component_size_coefficient = 2.0 enable_measured_admission = true worker_estimate_coefficient = 1.1 -worker_memory_max_safe_ratio = 0.9 -worker_memory_overcommit_ratio = 1.2 worker_memory_ratio = 0.8 [memory.oom_retry_config] @@ -463,8 +461,6 @@ without_time = false # component_size_coefficient = 2.0 # enable_measured_admission = true # worker_estimate_coefficient = 1.1 -# worker_memory_max_safe_ratio = 0.9 -# worker_memory_overcommit_ratio = 1.2 # worker_memory_ratio = 0.8 # # [memory.oom_retry_config] @@ -768,8 +764,6 @@ without_time = false # component_size_coefficient = 2.0 # enable_measured_admission = true # worker_estimate_coefficient = 1.1 -# worker_memory_max_safe_ratio = 0.9 -# worker_memory_overcommit_ratio = 1.2 # worker_memory_ratio = 0.8 # # [memory.oom_retry_config] diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt new file mode 100644 index 0000000000..eb12d21790 --- /dev/null +++ b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt @@ -0,0 +1,9 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc b49eb145c9dca28d347382d8e482bb2cb6c5d256ccaba7532b370fbadc2bb3fb # shrinks to (limit, residents) = (500, []), schedule = [Admit(220), Admit(92), Admit(189)] +cc 9727f7e7aab54f8f48e6b856f9d70428fd8503767677fa7c232e27263273e071 # shrinks to limit = 815, schedule = [Grant(485), Grant(1), Grant(7), Exit(1), Grant(1), FaultIn(2, 1), Grant(40), Exit(2), Grant(284)] +cc 41321d47abd75b283d651e63e40c0f5191b680b908c05879c02d5f36b70de66c # shrinks to (limit, residents) = (1369, [Resident { size: 144, priority: Idle }, Resident { size: 228, priority: Warm }, Resident { size: 152, priority: Warm }, Resident { size: 101, priority: Idle }, Resident { size: 68, priority: Warm }, Resident { size: 45, priority: Idle }, Resident { size: 30, priority: Idle }, Resident { size: 20, priority: Idle }, Resident { size: 13, priority: Warm }, Resident { size: 9, priority: Idle }, Resident { size: 6, priority: Idle }]), schedule = [Admit(270), Admit(785), Admit(250), Admit(146), Admit(456)] diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs index a62f944cf0..1eedc9f5e1 100644 --- a/golem-worker-executor/src/lib.rs +++ b/golem-worker-executor/src/lib.rs @@ -1002,7 +1002,8 @@ pub async fn bootstrap_and_run_worker_executor< let total_system_memory = golem_config.memory.total_system_memory(); let system_memory = golem_config.memory.system_memory(); - let worker_memory = golem_config.memory.worker_memory(); + let worker_memory = + (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64; info!( "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}", ISizeFormatter::new(total_system_memory, BINARY), diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index de6d673632..b611f9985b 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -190,6 +190,46 @@ pub mod workers { crate::metrics::BLOB_SIZE_BUCKETS.to_vec() ) .unwrap(); + pub 
static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!( + "golem_worker_memory_pool_total_bytes", + "Usable memory ceiling (usable_ratio * measured limit) the admission gate admits against on this executor", + &["executor_id"] + ) + .unwrap(); + pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!( + "golem_worker_memory_pool_used_bytes", + "Total linear memory granted to live workers and reserved by the admission gate on this executor", + &["executor_id"] + ) + .unwrap(); + pub static ref WORKER_ADMISSION_RSS_BYTES: GaugeVec = register_gauge_vec!( + "golem_worker_admission_rss_bytes", + "Measured resident memory (probe snapshot) the admission gate last read on this executor", + &["executor_id"] + ) + .unwrap(); + } + + /// Sets the gate's usable memory ceiling gauge. + pub fn record_worker_memory_ceiling(bytes: u64) { + WORKER_MEMORY_POOL_TOTAL_BYTES + .with_label_values(&[crate::metrics::storage::executor_id()]) + .set(bytes as f64); + } + + /// Sets the gauge of total memory granted to live workers (the gate's + /// reservation). + pub fn record_worker_memory_granted(bytes: u64) { + WORKER_MEMORY_POOL_USED_BYTES + .with_label_values(&[crate::metrics::storage::executor_id()]) + .set(bytes as f64); + } + + /// Sets the gauge of measured resident memory last read by the gate. + pub fn record_worker_admission_rss(bytes: u64) { + WORKER_ADMISSION_RSS_BYTES + .with_label_values(&[crate::metrics::storage::executor_id()]) + .set(bytes as f64); } pub fn record_worker_call(api_name: &'static str) { @@ -302,18 +342,6 @@ pub mod workers { WORKER_FILESYSTEM_SEMAPHORE_AVAILABLE.add(permits.into_f64()); } - /// Records acquisition of `bytes` from the worker-memory pool. - /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`. - pub fn record_memory_permit_acquired(bytes: usize) { - crate::metrics::storage::record_worker_memory_pool_acquired(bytes as u64); - } - - /// Records release of `bytes` back to the worker-memory pool. 
- /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`. - pub fn record_memory_permit_released(bytes: usize) { - crate::metrics::storage::record_worker_memory_pool_released(bytes as u64); - } - pub fn record_worker_kv_cache_value_size(bytes: usize) { WORKER_KV_CACHE_VALUE_SIZE_BYTES .with_label_values(&[crate::metrics::storage::executor_id()]) @@ -512,13 +540,13 @@ pub mod wasm { .unwrap(); static ref ALLOCATED_MEMORY_BYTES: Histogram = register_histogram!( "allocated_memory_bytes", - "Amount of memory allocated by a single memory.grow instruction", + "Worker's total linear memory size after a memory.grow, sampled at each grow", crate::metrics::MEMORY_SIZE_BUCKETS.to_vec() ) .unwrap(); static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!( "worker_resident_linear_memory_bytes", - "Per-worker cumulative linear-memory ceiling (total_linear_memory_size = sum of memory.grow deltas) sampled at permit acquire. This is the semaphore charge basis (x*ml), an upper bound on resident RSS, NOT measured resident memory (grown pages are largely demand-paged); compare to container_memory_working_set_bytes for the gap", + "Per-worker cumulative linear-memory grant (total_linear_memory_size = sum of memory.grow deltas) sampled when the worker is admitted. This is the linear memory the admission gate reserves for the worker; it is an upper bound on resident RSS, not measured resident memory, since grown pages are largely demand-paged. 
Compare to container_memory_working_set_bytes for the gap.", crate::metrics::MEMORY_SIZE_BUCKETS.to_vec() ) .unwrap(); @@ -759,18 +787,6 @@ pub mod storage { &["executor_id"] ) .unwrap(); - pub static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!( - "golem_worker_memory_pool_total_bytes", - "Configured worker-memory semaphore size in bytes for this executor", - &["executor_id"] - ) - .unwrap(); - pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!( - "golem_worker_memory_pool_used_bytes", - "Bytes currently acquired from the worker-memory semaphore on this executor", - &["executor_id"] - ) - .unwrap(); } pub fn record_filesystem_pool_total(bytes: u64) { @@ -790,22 +806,4 @@ pub mod storage { .with_label_values(&[executor_id()]) .sub(bytes as f64); } - - pub fn record_worker_memory_pool_total(bytes: u64) { - WORKER_MEMORY_POOL_TOTAL_BYTES - .with_label_values(&[executor_id()]) - .set(bytes as f64); - } - - pub fn record_worker_memory_pool_acquired(bytes: u64) { - WORKER_MEMORY_POOL_USED_BYTES - .with_label_values(&[executor_id()]) - .add(bytes as f64); - } - - pub fn record_worker_memory_pool_released(bytes: u64) { - WORKER_MEMORY_POOL_USED_BYTES - .with_label_values(&[executor_id()]) - .sub(bytes as f64); - } } diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs index d89f710859..0008f66773 100644 --- a/golem-worker-executor/src/services/active_workers/admission/mod.rs +++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs @@ -14,15 +14,36 @@ //! Measured-headroom admission decision. //! -//! Gates worker admission on the executor environment's *real* memory headroom -//! read from the [`MemoryProbe`], rather than on the estimate-based semaphore in -//! [`super::ActiveWorkers`]. This controller is the primary, authoritative -//! check against measured resident usage and refuses admission in normal -//! 
operation; the estimate semaphore is the second line of defence behind it, -//! its atomic permit acquisition catching the concurrent admissions this -//! (lockless) controller can let through on the same snapshot. When headroom is -//! short it evicts already-resident idle-then-warm work; if it still cannot make -//! room it rejects rather than over-committing. +//! Gates worker admission on the executor environment's memory headroom. It is +//! the sole admission authority: there is no estimate-based semaphore behind it. +//! +//! The gate weighs two quantities against the usable ceiling: +//! +//! * Measured RSS from the [`MemoryProbe`] (cgroup `memory.current` on a +//! constrained pod) — what is resident right now. +//! * The total linear memory *granted* to live workers — what they could fault +//! in at any moment. +//! +//! Both matter because they fail in opposite directions. Measured RSS lags +//! admission: `memory.current` counts only touched pages, so a worker admitted +//! moments ago is not yet resident and a burst admitted against the same low +//! snapshot would collectively over-commit. The granted total leads residency: a +//! worker can fault in any page of the virtual memory it was already granted at +//! any later time, with no admission call to intercept it, so a gate that +//! reserved only what is resident would let a node full of lightly-touched +//! workers OOM by writing into memory they already hold. The gate therefore +//! reserves the full granted total from admission until unload, and admits +//! against the *larger* of measured RSS and that granted total — safe against +//! both the burst race and later faulting of granted pages. +//! +//! The granted total is maintained by two integer updates: a worker's grant is +//! added on admission and removed on unload (via [`AdmissionController::release`] +//! from the worker lifecycle). The headroom check re-derives the reservation +//! 
from this maintained total and the current probe reading, so it is O(1) and +//! exact regardless of worker churn. +//! +//! When headroom is short the controller evicts already-resident idle-then-warm +//! work; if it still cannot make room it rejects rather than over-committing. //! //! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`] //! trait so its decision logic can be exercised in isolation with synthetic @@ -30,6 +51,7 @@ use super::memory_probe::MemoryProbe; use async_trait::async_trait; +use std::sync::Mutex; /// Why an eviction candidate is worth evicting, in priority order. Lower /// variants are evicted first. @@ -80,25 +102,81 @@ pub struct AdmissionPolicy { } /// Decides admission against measured headroom, evicting resident idle/warm -/// work as needed. Holds only its policy and probe; live state is read fresh -/// from the probe and the eviction source on each call (never cached). +/// work as needed. Holds its policy and probe; live usage is read fresh from the +/// probe on each call. The only retained state is `granted`: the total linear +/// memory granted to live workers, maintained across admit and unload, which the +/// gate reserves so a worker cannot OOM the node by faulting in granted pages. pub struct AdmissionController { probe: Box, policy: AdmissionPolicy, + granted: Mutex, } impl AdmissionController { pub fn new(probe: Box, policy: AdmissionPolicy) -> Self { - Self { probe, policy } + let ceiling = (probe.snapshot().limit_bytes as f64 * policy.usable_ratio) as u64; + crate::metrics::workers::record_worker_memory_ceiling(ceiling); + Self { + probe, + policy, + granted: Mutex::new(0), + } } - /// Bytes available for new admissions: the carve-out ceiling - /// (`usable_ratio × limit`) minus current usage. Saturating — never - /// underflows when already over the ceiling. 
+ /// Bytes available for a new admission: the usable ceiling minus the larger + /// of measured RSS and the total memory granted to live workers. Saturating — + /// never underflows when already over the ceiling. + /// + /// A worker can fault in any page of the virtual memory it was granted at any + /// time, with no admission call to intercept it, so the gate must reserve the + /// full granted total even before it is resident. Measured RSS is only larger + /// than the granted total transiently (host/runtime overhead the grant does + /// not cover), so taking the maximum keeps the gate safe against both the + /// grant a worker may yet fault in and any usage the grant does not capture. fn admissible_headroom(&self) -> u64 { let snapshot = self.probe.snapshot(); let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64; - ceiling.saturating_sub(snapshot.current_bytes) + let granted = *self.granted.lock().unwrap(); + crate::metrics::workers::record_worker_memory_ceiling(ceiling); + crate::metrics::workers::record_worker_admission_rss(snapshot.current_bytes); + ceiling.saturating_sub(snapshot.current_bytes.max(granted)) + } + + /// Record `request_bytes` of memory granted to a newly admitted worker. The + /// gate reserves this until the worker unloads, because the worker may fault + /// the granted pages in at any later time. + fn reserve(&self, request_bytes: u64) { + let mut granted = self.granted.lock().unwrap(); + *granted += request_bytes; + crate::metrics::workers::record_worker_memory_granted(*granted); + } + + /// Reserve memory for a cost that is a committed consequence of an already + /// admitted worker rather than a fresh admission — currently a component's + /// compiled module, loaded into RAM when the first worker of the component + /// becomes resident and shared by all its workers. Unlike admission this does + /// not evict or reject (the worker is already in); it accounts the bytes so + /// later admissions see them. 
Released with [`Self::release`]. + pub fn reserve_committed(&self, bytes: u64) { + self.reserve(bytes); + } + + /// Release the grant of a worker that has unloaded, given the bytes it was + /// granted. Its pages leave memory, so its grant no longer needs reserving; + /// not releasing it would permanently shrink admissible headroom as workers + /// come and go. + pub fn release(&self, reserved_bytes: u64) { + let mut granted = self.granted.lock().unwrap(); + *granted = granted.saturating_sub(reserved_bytes); + crate::metrics::workers::record_worker_memory_granted(*granted); + } + + /// Pre-register grant bytes for workers that were already live when the + /// controller was created. Test-only: production registers every worker's + /// grant through admission. + #[cfg(test)] + pub fn seed_granted(&self, bytes: u64) { + *self.granted.lock().unwrap() += bytes; } /// Decide whether `request_bytes` can be admitted, evicting from `source` if @@ -107,7 +185,8 @@ impl AdmissionController { /// Eviction is attempted idle-first, then warm, and only up to the shortfall /// (never evicts when headroom already suffices). After eviction the /// headroom is re-measured against ground truth; the request is admitted only - /// if the real headroom now covers it, otherwise it is rejected. + /// if the real headroom now covers it, otherwise it is rejected. On admit the + /// request is added to the in-flight reservation. pub async fn try_admit( &self, request_bytes: u64, @@ -115,6 +194,7 @@ impl AdmissionController { ) -> AdmissionDecision { // Fast path: enough real headroom already, admit without evicting. if self.admissible_headroom() >= request_bytes { + self.reserve(request_bytes); return AdmissionDecision::Admit; } @@ -134,6 +214,7 @@ impl AdmissionController { // the probe is the authority, and other activity may have moved usage // in either direction while we were evicting. 
if self.admissible_headroom() >= request_bytes { + self.reserve(request_bytes); AdmissionDecision::Admit } else { AdmissionDecision::Reject diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs index 4996b97be7..24e9b3e119 100644 --- a/golem-worker-executor/src/services/active_workers/admission/tests.rs +++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs @@ -47,6 +47,20 @@ struct Resident { priority: EvictionPriority, } +/// An admitted request whose pages have not yet fully faulted into RSS. +/// +/// Models the gap between admission and residency: the worker has been admitted +/// for `reserved` bytes but only `resident` of them have actually touched memory +/// so far. Real RSS (what the probe reads) reflects only `resident`; the +/// remaining `reserved - resident` bytes are still in flight and will appear in +/// RSS later. This lag is what lets concurrent admissions on the same RSS +/// snapshot collectively over-commit. +#[derive(Debug, Clone, Copy)] +struct InFlight { + reserved: u64, + resident: u64, +} + /// Shared model of the executor environment's memory. #[derive(Debug, Default)] struct EnvState { @@ -56,6 +70,10 @@ struct EnvState { pinned_usage: u64, /// Resident, evictable work — what the controller may reclaim. residents: Vec, + /// Admitted requests whose pages are still faulting in. Their `resident` + /// portion counts toward measured RSS now; their full `reserved` size is + /// what RSS will reach once they are fully resident. + in_flight: Vec, /// Count of evictions performed, for the no-spurious-eviction property. evictions: usize, /// The priorities evicted, in order, for the ordering property. @@ -63,8 +81,64 @@ struct EnvState { } impl EnvState { + /// Measured RSS: the bytes that have actually faulted in. Lags behind what + /// has been admitted, because in-flight requests are only partially + /// resident. 
This is what the probe reports. fn usage(&self) -> u64 { - self.pinned_usage + self.residents.iter().map(|r| r.size).sum::() + self.pinned_usage + + self.residents.iter().map(|r| r.size).sum::() + + self.in_flight.iter().map(|f| f.resident).sum::() + } + + /// Total bytes that admitted work will eventually occupy once every + /// in-flight request has fully faulted in. The safety property is stated + /// against this value: reserved bytes always become resident, so if this + /// can exceed the limit the environment will OOM once the lag resolves. + fn eventual_usage(&self) -> u64 { + self.pinned_usage + + self.residents.iter().map(|r| r.size).sum::() + + self.in_flight.iter().map(|f| f.reserved).sum::() + } + + /// Advance residency: each in-flight request faults in up to `step` more of + /// its reserved bytes, raising measured RSS toward its eventual size. + /// Fully-resident requests are retired into `pinned_usage`. + fn tick_residency(&mut self, step: u64) { + for f in &mut self.in_flight { + let remaining = f.reserved - f.resident; + f.resident += remaining.min(step); + } + let (done, pending): (Vec<_>, Vec<_>) = self + .in_flight + .drain(..) + .partition(|f| f.resident >= f.reserved); + self.pinned_usage += done.iter().map(|f| f.reserved).sum::(); + self.in_flight = pending; + } + + /// Fault in `step` bytes of granted-but-untouched memory belonging to the + /// in-flight request at `index`, without faulting in any other request. A + /// worker may touch the virtual memory it was already granted at any later + /// time, with no admission call in the loop, so this raises measured RSS for + /// one worker in isolation. + fn fault_in_one(&mut self, index: usize, step: u64) { + if let Some(f) = self.in_flight.get_mut(index) { + let remaining = f.reserved - f.resident; + f.resident += remaining.min(step); + } + } + + /// Remove the in-flight worker at `index`: it finishes and unloads, freeing + /// both its resident pages and its remaining grant. 
Measured RSS drops by its + /// resident portion. Returns the bytes it was admitted for, so the caller can + /// release the gate's reservation for it. The surviving workers' reservations + /// for their own untouched grants must not be credited by this drop. + fn exit_one(&mut self, index: usize) -> Option { + if index < self.in_flight.len() { + Some(self.in_flight.remove(index).reserved) + } else { + None + } } } @@ -85,6 +159,9 @@ impl MemoryProbe for FakeProbe { struct FakeEvictionSource { state: Arc>, + /// The gate, so eviction can release each evicted resident's grant — in + /// production, eviction unloads the worker, which releases its grant. + controller: Arc, } #[async_trait::async_trait] @@ -99,6 +176,7 @@ impl EvictionSource for FakeEvictionSource { if state.residents[i].priority == priority { let victim = state.residents.remove(i); freed += victim.size; + self.controller.release(victim.size); state.evictions += 1; state.eviction_order.push(priority); } else { @@ -109,17 +187,35 @@ impl EvictionSource for FakeEvictionSource { } } -fn controller(state: Arc>) -> AdmissionController { +fn controller(state: Arc>) -> Arc { controller_with_ratio(state, 1.0) } -fn controller_with_ratio(state: Arc>, usable_ratio: f64) -> AdmissionController { - AdmissionController::new( +fn controller_with_ratio( + state: Arc>, + usable_ratio: f64, +) -> Arc { + // Workers already resident when the gate is created had their grants + // registered at their own admission; seed the gate to match. 
+ let initial_granted = { + let s = state.lock().unwrap(); + s.pinned_usage + s.residents.iter().map(|r| r.size).sum::() + }; + let controller = AdmissionController::new( Box::new(FakeProbe { state: state.clone(), }), AdmissionPolicy { usable_ratio }, - ) + ); + controller.seed_granted(initial_granted); + Arc::new(controller) +} + +fn eviction_source( + state: Arc>, + controller: Arc, +) -> FakeEvictionSource { + FakeEvictionSource { state, controller } } /// Apply one admission attempt against the model, mutating `usage` on admit. @@ -136,6 +232,28 @@ async fn apply_admit( decision } +/// Apply one admission attempt where admitted bytes do NOT become resident +/// immediately. On admit the request is recorded as in-flight with zero resident +/// bytes, so measured RSS is unchanged until a later residency tick faults its +/// pages in. This models the real lag between admission and RSS, the window in +/// which concurrent admissions on the same snapshot can collectively +/// over-commit. +async fn apply_staggered_admit( + controller: &AdmissionController, + source: &FakeEvictionSource, + state: &Arc>, + request: u64, +) -> AdmissionDecision { + let decision = controller.try_admit(request, source).await; + if decision == AdmissionDecision::Admit { + state.lock().unwrap().in_flight.push(InFlight { + reserved: request, + resident: 0, + }); + } + decision +} + // ── Single-case unit tests ─────────────────────────────────────────────────── #[test] @@ -150,9 +268,7 @@ async fn admits_when_headroom_is_ample_without_evicting() { ..Default::default() })); let ctrl = controller(state.clone()); - let source = FakeEvictionSource { - state: state.clone(), - }; + let source = eviction_source(state.clone(), ctrl.clone()); let decision = apply_admit(&ctrl, &source, &state, 200).await; assert_eq!(decision, AdmissionDecision::Admit); @@ -180,9 +296,7 @@ async fn evicts_idle_before_warm() { // usage = 800, limit = 1000, headroom = 200. Request 300 → shortfall 100. 
// One idle (400) covers it; warm must remain untouched. let ctrl = controller(state.clone()); - let source = FakeEvictionSource { - state: state.clone(), - }; + let source = eviction_source(state.clone(), ctrl.clone()); let decision = apply_admit(&ctrl, &source, &state, 300).await; assert_eq!(decision, AdmissionDecision::Admit); @@ -202,9 +316,7 @@ async fn rejects_when_nothing_can_be_freed() { ..Default::default() })); let ctrl = controller(state.clone()); - let source = FakeEvictionSource { - state: state.clone(), - }; + let source = eviction_source(state.clone(), ctrl.clone()); let decision = apply_admit(&ctrl, &source, &state, 200).await; assert_eq!(decision, AdmissionDecision::Reject); @@ -219,6 +331,17 @@ enum Op { Admit(u64), } +/// An operation in a staggered-start schedule. Unlike [`Op`], admitted bytes do +/// not become resident immediately — `Tick` advances residency separately, so +/// the schedule can interleave admissions and page-faulting in any order. +#[derive(Debug, Clone)] +enum StaggeredOp { + /// Attempt to admit a worker reserving this many bytes. + Admit(u64), + /// Fault in up to this many more bytes of every in-flight worker. + Tick(u64), +} + fn arb_resident_priority() -> impl Strategy { prop_oneof![Just(EvictionPriority::Idle), Just(EvictionPriority::Warm)] } @@ -279,7 +402,7 @@ proptest! { ..Default::default() })); let ctrl = controller(state.clone()); - let source = FakeEvictionSource { state: state.clone() }; + let source = eviction_source(state.clone(), ctrl.clone()); for op in ops { match op { @@ -318,7 +441,7 @@ proptest! { ..Default::default() })); let ctrl = controller(state.clone()); - let source = FakeEvictionSource { state: state.clone() }; + let source = eviction_source(state.clone(), ctrl.clone()); for op in ops { match op { @@ -349,7 +472,7 @@ proptest! 
{ ..Default::default() })); let ctrl = controller(state.clone()); - let source = FakeEvictionSource { state: state.clone() }; + let source = eviction_source(state.clone(), ctrl.clone()); for op in ops { match op { @@ -375,6 +498,285 @@ proptest! { } } +// ── Staggered-start safety ─────────────────────────────────────────────────── + +/// A schedule of admissions interleaved with residency ticks. Admissions +/// reserve bytes that only become resident when a later `Tick` faults them in, +/// so the schedule exercises the lag between admission and measured RSS in which +/// concurrent admissions can collectively over-commit. Skewed toward `Admit` so +/// bursts of admissions land between ticks (the dangerous case). +fn arb_staggered_schedule() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 3 => (1u64..800).prop_map(StaggeredOp::Admit), + 1 => (1u64..800).prop_map(StaggeredOp::Tick), + ], + 0..60, + ) +} + +proptest! { + /// Safety invariant under staggered starts: for any interleaving of + /// admissions and residency ticks, once every admitted worker has fully + /// faulted its pages in, resident usage must not exceed the limit. + /// + /// Reserved bytes always eventually become resident, so the check is made + /// against the state after a final full-residency tick: if that can exceed + /// the limit, the environment OOMs once the admission lag resolves. This is + /// the general form of the staggered-burst case — admissions that read the + /// same low RSS snapshot before each other's pages are counted. 
+ #[test] + fn staggered_starts_never_exceed_limit_once_resident( + (limit, residents) in arb_fitting_state(500..5000, 20), + schedule in arb_staggered_schedule(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in schedule { + match op { + StaggeredOp::Admit(req) => { + apply_staggered_admit(&ctrl, &source, &state, req).await; + } + StaggeredOp::Tick(step) => { + state.lock().unwrap().tick_residency(step); + } + } + // Even mid-flight, measured RSS must never exceed the limit. + let s = state.lock().unwrap(); + prop_assert!( + s.usage() <= s.limit, + "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit + ); + } + + // Fault in everything still in flight, then check the eventual + // resident footprint fits. + state.lock().unwrap().tick_residency(u64::MAX); + let s = state.lock().unwrap(); + prop_assert!( + s.eventual_usage() <= s.limit, + "eventual resident usage {} exceeded limit {} once fully resident", + s.eventual_usage(), s.limit + ); + Ok(()) + }).unwrap(); + } +} + +// ── Granted virtual memory ─────────────────────────────────────────────────── + +/// One step of a schedule that stresses granted-but-untouched memory. +#[derive(Debug, Clone)] +enum GrantOp { + /// Attempt to admit a worker granted this many bytes of linear memory. + Grant(u64), + /// Fault in up to this many bytes of the in-flight worker at this index, + /// in isolation from the others. + FaultIn(usize, u64), + /// The in-flight worker at this index finishes and unloads, dropping its + /// resident pages and its remaining grant. 
+ Exit(usize), +} + +fn arb_grant_schedule() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 3 => (1u64..800).prop_map(GrantOp::Grant), + 3 => (0usize..20, 1u64..800).prop_map(|(i, step)| GrantOp::FaultIn(i, step)), + 1 => (0usize..20).prop_map(GrantOp::Exit), + ], + 0..80, + ) +} + +proptest! { + /// A worker may fault in the virtual memory it was already granted at any + /// later time, with no admission call in the loop. Once every granted byte + /// of every admitted worker becomes resident, that resident footprint must + /// not exceed the limit. + /// + /// Granted bytes can always become resident — nothing in the runtime forces + /// a worker to leave granted pages untouched — so the safety check is made + /// against the sum of granted sizes after faulting everything in. If that + /// can exceed the limit, a node of workers touching their already-granted + /// pages will OOM with no grow and no admission to intercept it. + #[test] + fn granted_memory_never_exceeds_limit_once_faulted_in( + limit in 800u64..6000, + schedule in arb_grant_schedule(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() })); + // usable_ratio 1.0 isolates the granted-memory hole from the host + // carve-out. 
+ let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in schedule { + match op { + GrantOp::Grant(bytes) => { + apply_staggered_admit(&ctrl, &source, &state, bytes).await; + } + GrantOp::FaultIn(index, step) => { + state.lock().unwrap().fault_in_one(index, step); + } + GrantOp::Exit(index) => { + let reserved = state.lock().unwrap().exit_one(index); + if let Some(reserved) = reserved { + ctrl.release(reserved); + } + } + } + let s = state.lock().unwrap(); + prop_assert!( + s.usage() <= s.limit, + "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit + ); + } + + // Every granted byte may yet fault in. Once it all does, it must fit. + state.lock().unwrap().tick_residency(u64::MAX); + let s = state.lock().unwrap(); + prop_assert!( + s.eventual_usage() <= s.limit, + "granted memory {} exceeded limit {} once fully faulted in", + s.eventual_usage(), s.limit + ); + Ok(()) + }).unwrap(); + } + + /// Liveness: once every admitted worker has unloaded and its pages have left + /// memory, the gate's admissible headroom must return to the full ceiling. + /// + /// Reservations for workers that exit while still holding untouched granted + /// memory must be released on unload. If they were not, each such exit would + /// permanently shrink headroom, and a node churning workers would slowly + /// refuse all admissions despite being empty. 
+ #[test] + fn headroom_recovers_after_all_workers_exit( + limit in 800u64..6000, + schedule in arb_grant_schedule(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let usable_ratio = 0.8; + let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() })); + let ctrl = controller_with_ratio(state.clone(), usable_ratio); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in schedule { + match op { + GrantOp::Grant(bytes) => { + apply_staggered_admit(&ctrl, &source, &state, bytes).await; + } + GrantOp::FaultIn(index, step) => { + state.lock().unwrap().fault_in_one(index, step); + } + GrantOp::Exit(index) => { + let reserved = state.lock().unwrap().exit_one(index); + if let Some(reserved) = reserved { + ctrl.release(reserved); + } + } + } + } + + // Unload every worker still resident, releasing each reservation, and + // clear measured RSS — the environment is now empty. + loop { + let reserved = state.lock().unwrap().exit_one(0); + match reserved { + Some(reserved) => ctrl.release(reserved), + None => break, + } + } + { + let mut s = state.lock().unwrap(); + s.pinned_usage = 0; + s.residents.clear(); + } + + let ceiling = (limit as f64 * usable_ratio) as u64; + let headroom = ctrl.headroom_bytes(); + prop_assert_eq!( + headroom, ceiling, + "headroom {} did not recover to ceiling {} after all workers exited", + headroom, ceiling + ); + Ok(()) + }).unwrap(); + } +} + +// ── Density ────────────────────────────────────────────────────────────────── + +proptest! { + /// Density invariant: in a settled state (no admission lag outstanding), the + /// gate packs the environment to within one request of the usable ceiling + /// before it starts rejecting. It must not stop admitting while substantial + /// usable room remains. 
+ /// + /// The schedule admits a fixed request size, fully faulting each admitted + /// worker in before the next admit so measured RSS tracks admitted bytes and + /// the in-flight reservation drains to zero — the steady-state regime where + /// density matters. At the first rejection, resident usage must be at least + /// `ceiling - request`: the only room a correct gate may leave free is the + /// part too small to fit one more request. + #[test] + fn admits_to_within_one_request_of_the_ceiling( + limit in 2000u64..20_000, + request in 50u64..600, + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let usable_ratio = 0.8; + let state = Arc::new(Mutex::new(EnvState { + limit, + ..Default::default() + })); + let ctrl = controller_with_ratio(state.clone(), usable_ratio); + let source = eviction_source(state.clone(), ctrl.clone()); + + let ceiling = (limit as f64 * usable_ratio) as u64; + + // Admit until the first rejection, faulting each worker fully in + // before the next so no reservation lag is outstanding. + let mut rejected = false; + for _ in 0..((limit / request) + 2) { + let decision = apply_staggered_admit(&ctrl, &source, &state, request).await; + state.lock().unwrap().tick_residency(u64::MAX); + if decision == AdmissionDecision::Reject { + rejected = true; + break; + } + } + + prop_assert!(rejected, "gate never rejected; ceiling {ceiling} too large for the schedule"); + + let s = state.lock().unwrap(); + prop_assert!( + s.usage() + request > ceiling, + "gate rejected at resident usage {} with ceiling {ceiling}: left more than one request ({request}) of usable room free", + s.usage() + ); + // And it must never have over-committed. 
+ prop_assert!(s.eventual_usage() <= s.limit); + Ok(()) + }).unwrap(); + } +} + // ── Carve-out ratio ────────────────────────────────────────────────────────── #[test] @@ -388,9 +790,7 @@ async fn usable_ratio_caps_admission_below_full_limit() { // ceiling = 0.8 * 1000 = 800. Request 850 must be rejected even though the // raw limit (1000) would allow it — the top 20% is reserved for the host. let ctrl = controller_with_ratio(state.clone(), 0.8); - let source = FakeEvictionSource { - state: state.clone(), - }; + let source = eviction_source(state.clone(), ctrl.clone()); assert_eq!( apply_admit(&ctrl, &source, &state, 850).await, diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 0b8e02fa38..b428674a07 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -36,7 +36,6 @@ use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegist use memory_probe::default_probe; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore, TryAcquireError}; use tracing::{Instrument, debug}; @@ -73,26 +72,21 @@ impl RegisteredConcurrentAccount { /// Holds the metadata and wasmtime structures of currently active Golem workers pub struct ActiveWorkers { workers: Cache>, WorkerExecutorError>, - worker_memory: Arc, worker_filesystem_storage: Arc, concurrent_agents: Arc, - priority_allocation_lock: Arc>, acquire_retry_delay: Duration, - /// Authoritative measured-headroom admission gate. Decides whether real - /// memory headroom permits a new acquisition, evicting via the worker set - /// when short, and is what refuses admission in normal operation. 
The - /// estimate-based `worker_memory` semaphore is the second line of defence - /// behind it: its atomic permit acquisition catches the concurrent - /// admissions the lockless gate can let through on the same snapshot. `None` - /// when measured admission is disabled (e.g. shared test environments) — - /// admission then relies on the estimate semaphore alone. - admission: Option, - /// Charges each resident component's compiled module size to the estimate - /// pool exactly once (shared across all its workers) rather than per worker. - component_charges: - Arc>>, + /// Authoritative measured-headroom admission gate, and the sole admission + /// authority. Decides whether real memory headroom permits a new + /// acquisition, evicting via the worker set when short. `None` when measured + /// admission is disabled (e.g. shared test environments), in which case + /// acquisition always proceeds. + admission: Option>, + /// Reserves each resident component's compiled module size with the gate + /// exactly once (shared across all its workers) rather than per worker, so + /// the module's resident cost is accounted before it faults into memory. + component_charges: Arc>, /// Multiplier applied to a component's `component_size` when sizing its - /// module charge permit. + /// module charge. component_size_coefficient: f64, } @@ -100,98 +94,56 @@ pub struct ActiveWorkers { type ComponentChargeKey = (ComponentId, ComponentRevision); /// Guard held by a resident worker keeping its component's module charge alive. 
-pub type WorkerComponentCharge = - ComponentChargeGuard>; - -#[derive(Debug)] -pub struct WorkerMemoryPermit { - permit: Option, -} - -impl WorkerMemoryPermit { - fn new(permit: OwnedSemaphorePermit) -> Self { - crate::metrics::workers::record_memory_permit_acquired(permit.num_permits()); - Self { - permit: Some(permit), - } - } - - pub fn num_permits(&self) -> usize { - self.permit - .as_ref() - .map_or(0, |permit| permit.num_permits()) - } - - pub fn merge(&mut self, mut other: Self) { - if let Some(other_permit) = other.permit.take() { - match &mut self.permit { - Some(permit) => permit.merge(other_permit), - None => self.permit = Some(other_permit), - } - } - } -} - -impl Drop for WorkerMemoryPermit { - fn drop(&mut self) { - crate::metrics::workers::record_memory_permit_released(self.num_permits()); - } -} +pub type WorkerComponentCharge = ComponentChargeGuard; impl ActiveWorkers { pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self { - // Build the probe once and size both admission layers from its reported - // limit, so the estimate semaphore and the measured-headroom gate share - // a single basis (the pod's cgroup limit when constrained, not host RAM). + // Build the probe once and hand it to the measured-headroom gate, which + // bases its decision on the pod's cgroup limit when constrained (not host + // RAM). 
let probe = default_probe(memory_config.system_memory_override); - let worker_memory_size = memory_config.worker_memory_for_limit(probe.limit_bytes()); - let admission = memory_config - .enable_measured_admission - .then(|| AdmissionController::new(probe, memory_config.admission_policy())); + let admission = memory_config.enable_measured_admission.then(|| { + Arc::new(AdmissionController::new( + probe, + memory_config.admission_policy(), + )) + }); let workers = Cache::new( None, FullCacheEvictionMode::None, BackgroundEvictionMode::None, "active_workers", ); - let worker_memory = Arc::new(Semaphore::new(worker_memory_size)); - let priority_allocation_lock = Arc::new(Mutex::new(())); - let component_charges = ComponentChargeRegistry::new(MemoryPoolChargeSource { - worker_memory: worker_memory.clone(), - workers: workers.clone(), - priority_allocation_lock: priority_allocation_lock.clone(), - acquire_retry_delay: memory_config.acquire_retry_delay, + let component_charges = ComponentChargeRegistry::new(GateChargeSource { + admission: admission.clone(), }); let active_workers = Self { workers, - worker_memory, worker_filesystem_storage: Arc::new(FilesystemStorageSemaphore::new( storage_config.worker_filesystem_storage(), storage_config.acquire_retry_delay, )), concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()), acquire_retry_delay: memory_config.acquire_retry_delay, - priority_allocation_lock, admission, component_charges, component_size_coefficient: memory_config.component_size_coefficient, }; - active_workers.initialize_metrics(worker_memory_size); + active_workers.initialize_metrics(); active_workers } /// Acquire (or share) the per-component module charge for a worker of the - /// given component. The first resident worker of the component pays its - /// compiled-module size (scaled by `component_size_coefficient`) into the - /// estimate pool; subsequent workers share the same charge. 
The returned - /// guard releases residency on drop, and the charge is freed when the last - /// worker of the component unloads. + /// given component. The first resident worker of the component reserves its + /// compiled-module size (scaled by `component_size_coefficient`) with the + /// gate; subsequent workers share the same charge. The returned guard + /// releases the charge when the last worker of the component unloads. pub async fn acquire_component_charge( &self, component_id: ComponentId, component_revision: ComponentRevision, component_module_bytes: u64, - ) -> WorkerComponentCharge { + ) -> WorkerComponentCharge { let charge_bytes = (self.component_size_coefficient * component_module_bytes as f64) as u64; self.component_charges .acquire((component_id, component_revision), charge_bytes) @@ -270,52 +222,31 @@ impl ActiveWorkers { } } - pub async fn acquire(&self, memory: u64) -> WorkerMemoryPermit { - let mem32: u32 = memory - .try_into() - .expect("requested memory size is too large"); - + /// Blocking memory admission for a starting worker. Loops until the gate + /// admits the request, backing off between attempts. + /// + /// A rejection is transient, not terminal. The gate reads resident memory + /// from the probe, which lags real usage (cgroup `memory.current` only counts + /// already-touched pages), so a worker admitted earlier may not yet be fully + /// resident; pressure eases as its pages settle and as other workers finish. + /// Each iteration backs off and re-reads the gate, so the caller eventually + /// proceeds once headroom recovers rather than failing under momentary + /// pressure. With measured admission disabled the worker is admitted + /// immediately. + pub async fn acquire(&self, memory: u64) { + let Some(admission) = &self.admission else { + return; + }; loop { - // Blocking acquire: retry until the request can be admitted. A - // rejection here is transient, not terminal. 
The gate reads resident - // memory from the probe, which lags real usage (cgroup - // `memory.current` only counts already-touched pages), so a worker - // admitted earlier may not yet be fully resident; pressure eases as - // its pages settle and as other workers finish and release pool - // permits. Each iteration backs off, re-reads the gate, and re-tries - // the pool, so the caller eventually proceeds once headroom recovers - // rather than failing under momentary pressure. - // Authoritative measured-headroom gate (when enabled). Evicts - // idle-then-warm when real headroom is short; rejects (and we back - // off) when it cannot make room rather than risking the limit. - if let Some(admission) = &self.admission - && admission.try_admit(memory, &self.eviction_source()).await - == AdmissionDecision::Reject - { - debug!("Measured headroom insufficient for {mem32}, backing off and retrying"); - tokio::time::sleep(self.acquire_retry_delay).await; - continue; - } - - // Estimate-semaphore pool: the second line of defence behind the - // gate. Its atomic permit acquisition catches the concurrent - // admissions the lockless gate can let through on the same snapshot. - // Sized above the gate ceiling (but clamped below the limit), so it - // rarely binds first — the gate refuses in normal operation. - if let Some(permit) = acquire_pool_permit( - &self.worker_memory, - &self.workers, - &self.priority_allocation_lock, - self.acquire_retry_delay, - mem32, - memory, - ) - .await + // Evicts idle-then-warm when real headroom is short; rejects (and we + // back off) when it cannot make room rather than risking the limit. + if admission.try_admit(memory, &self.eviction_source()).await + == AdmissionDecision::Admit { - break permit; + return; } - // Pool could not satisfy the estimate even after eviction; loop and - // re-run the gate before trying again. 
+ debug!("Measured headroom insufficient for {memory}, backing off and retrying"); + tokio::time::sleep(self.acquire_retry_delay).await; } } @@ -327,51 +258,31 @@ impl ActiveWorkers { } } - pub async fn try_acquire(&self, memory: u64) -> Option { - let mem32: u32 = memory - .try_into() - .expect("requested memory size is too large"); - - // Authoritative measured-headroom gate (when enabled). Single attempt - // (this is the non-blocking path): if real headroom is insufficient even - // after eviction, do not admit. - if let Some(admission) = &self.admission - && admission.try_admit(memory, &self.eviction_source()).await - == AdmissionDecision::Reject - { - debug!("Measured headroom insufficient for {mem32}, not admitting"); - return None; + /// Non-blocking memory admission for a growing worker. A single gate attempt: + /// returns `true` when the grow is admitted, `false` when real headroom is + /// insufficient even after eviction (the caller turns this into a retriable + /// out-of-memory trap). With measured admission disabled the grow is always + /// admitted. 
+ pub async fn try_acquire(&self, memory: u64) -> bool { + let Some(admission) = &self.admission else { + return true; + }; + match admission.try_admit(memory, &self.eviction_source()).await { + AdmissionDecision::Admit => true, + AdmissionDecision::Reject => { + debug!("Measured headroom insufficient for {memory}, not admitting"); + false + } } + } - let mut lock = None; - loop { - match self.worker_memory.clone().try_acquire_many_owned(mem32) { - Ok(permit) => { - debug!( - "Acquired {} memory of {}", - mem32, - self.worker_memory.available_permits() - ); - break Some(WorkerMemoryPermit::new(permit)); - } - Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"), - Err(TryAcquireError::NoPermits) => { - if lock.is_none() { - debug!( - "Not enough available memory to acquire {mem32} (available: {}), cancelling waiting acquires and retry", - self.worker_memory.available_permits() - ); - lock = Some(self.priority_allocation_lock.lock().await); - continue; - } else { - debug!( - "Not enough available memory to acquire {mem32} (available: {})", - self.worker_memory.available_permits() - ); - break None; - } - } - } + /// Release the memory a worker reserved with the admission gate when it + /// unloads. `bytes` must be the cumulative amount the worker reserved through + /// [`Self::acquire`] and [`Self::try_acquire`], so the gate's granted total + /// stays symmetric. No-op when measured admission is disabled. + pub fn release_memory(&self, bytes: u64) { + if let Some(admission) = &self.admission { + admission.release(bytes); } } @@ -488,12 +399,11 @@ impl ActiveWorkers { } /// Initializes worker gauges. Subsequent changes are recorded inline at the mutation sites. 
- fn initialize_metrics(&self, worker_memory_size: usize) { + fn initialize_metrics(&self) { crate::metrics::workers::initialize_worker_metrics(); crate::metrics::workers::set_filesystem_semaphore_available( self.worker_filesystem_storage.available_bytes(), ); - crate::metrics::storage::record_worker_memory_pool_total(worker_memory_size as u64); } } @@ -547,62 +457,7 @@ async fn evict_at_most_memory( freed } -/// Frees up to `memory` estimate-permit bytes by evicting idle-then-warm -/// workers, accounting for permits already available. Returns true when enough -/// is (or was already) free. -async fn try_free_up_pool_memory( - worker_memory: &Semaphore, - workers: &Cache>, WorkerExecutorError>, - memory: u64, -) -> bool { - let current_avail = worker_memory.available_permits(); - let needed = memory.saturating_sub(current_avail as u64); - if needed == 0 { - return true; - } - - let mut freed = 0u64; - for priority in [EvictionPriority::Idle, EvictionPriority::Warm] { - if freed >= needed { - break; - } - freed += evict_at_most_memory(workers, priority, needed - freed).await; - } - freed >= needed -} - -/// Single estimate-semaphore acquisition attempt with eviction. Returns the -/// permit on success, or `None` when the pool cannot satisfy `mem32` even after -/// evicting idle/warm workers (caller decides whether to retry). Shared by -/// `ActiveWorkers::acquire` and the per-component charge source so there is one -/// pool-acquire implementation. 
-async fn acquire_pool_permit( - worker_memory: &Arc, - workers: &Cache>, WorkerExecutorError>, - priority_allocation_lock: &Mutex<()>, - acquire_retry_delay: Duration, - mem32: u32, - memory: u64, -) -> Option { - let lock = priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once - let result = worker_memory.clone().try_acquire_many_owned(mem32); - drop(lock); - match result { - Ok(permit) => Some(WorkerMemoryPermit::new(permit)), - Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"), - Err(TryAcquireError::NoPermits) => { - if try_free_up_pool_memory(worker_memory, workers, memory).await { - // Freed enough; signal the caller to retry the acquire. - None - } else { - // Could not free enough; wait before the caller retries. - tokio::time::sleep(acquire_retry_delay).await; - None - } - } - } -} - +/// A source of evictable, already-resident memory the gate reclaims through. struct WorkerEvictionSource { workers: Cache>, WorkerExecutorError>, } @@ -614,36 +469,41 @@ impl EvictionSource for WorkerEvictionSource { } } -/// Production [`ChargeSource`] for the per-component module charge. Takes -/// estimate-semaphore permits via the same pool acquire+evict path as worker -/// memory (the measured-headroom gate already accounts for the resident module -/// via real RSS, so the charge does not pass through it). -pub struct MemoryPoolChargeSource { - worker_memory: Arc, - workers: Cache>, WorkerExecutorError>, - priority_allocation_lock: Arc>, - acquire_retry_delay: Duration, +/// Production [`ChargeSource`] for the per-component module charge: reserves the +/// module's bytes with the measured-headroom gate. The module is a committed +/// consequence of admitting the first worker of a component (it loads into RAM +/// when that worker becomes resident), so it is reserved rather than admitted — +/// it neither evicts nor can be refused. 
`None` when measured admission is +/// disabled, in which case the charge is a no-op. +pub struct GateChargeSource { + admission: Option>, +} + +/// Held module charge: releases its reserved bytes from the gate on drop. +pub struct GateCharge { + admission: Option>, + bytes: u64, +} + +impl Drop for GateCharge { + fn drop(&mut self) { + if let Some(admission) = &self.admission { + admission.release(self.bytes); + } + } } #[async_trait] -impl ChargeSource for MemoryPoolChargeSource { - type Charge = WorkerMemoryPermit; +impl ChargeSource for GateChargeSource { + type Charge = GateCharge; - async fn acquire_charge(&self, bytes: u64) -> WorkerMemoryPermit { - let mem32: u32 = bytes.try_into().expect("component charge size too large"); - loop { - if let Some(permit) = acquire_pool_permit( - &self.worker_memory, - &self.workers, - &self.priority_allocation_lock, - self.acquire_retry_delay, - mem32, - bytes, - ) - .await - { - break permit; - } + async fn acquire_charge(&self, bytes: u64) -> GateCharge { + if let Some(admission) = &self.admission { + admission.reserve_committed(bytes); + } + GateCharge { + admission: self.admission.clone(), + bytes, } } } diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs index 82430c243b..074576eb54 100644 --- a/golem-worker-executor/src/services/active_workers/tests.rs +++ b/golem-worker-executor/src/services/active_workers/tests.rs @@ -729,3 +729,114 @@ async fn scheduler_accounts_are_independent() { drop(a1); drop(a2); } + +// ── Component module charge against the admission gate ─────────────────────── + +mod component_module_charge { + use super::super::admission::{AdmissionController, AdmissionPolicy}; + use super::super::component_charge::ComponentChargeRegistry; + use super::super::memory_probe::{MemoryProbe, MemorySnapshot}; + use super::super::{ComponentChargeKey, GateChargeSource, HeldComponentCharge}; + use 
golem_common::model::component::{ComponentId, ComponentRevision}; + use std::sync::Arc; + use test_r::test; + use uuid::Uuid; + + /// Probe reporting a fixed limit and zero resident memory, so the gate's + /// reservation is driven entirely by what is charged through it. + #[derive(Debug)] + struct FixedProbe { + limit: u64, + } + + impl MemoryProbe for FixedProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit, + current_bytes: 0, + } + } + } + + fn key() -> ComponentChargeKey { + (ComponentId(Uuid::new_v4()), ComponentRevision::INITIAL) + } + + /// The first worker of a component reserves the module's bytes with the gate, + /// so admissible headroom drops by the module size before it faults into + /// memory. A second worker of the same component reserves nothing more, and + /// the reservation is released only when the last worker unloads. + #[test] + async fn module_charge_reserves_with_gate_until_last_worker_unloads() { + let limit = 1000u64; + let module_bytes = 200u64; + let controller = Arc::new(AdmissionController::new( + Box::new(FixedProbe { limit }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + let registry = ComponentChargeRegistry::new(GateChargeSource { + admission: Some(controller.clone()), + }); + let component = key(); + + assert_eq!(controller.headroom_bytes(), limit); + + let first = registry.acquire(component.clone(), module_bytes).await; + assert_eq!( + controller.headroom_bytes(), + limit - module_bytes, + "first worker of a component must reserve the module size with the gate" + ); + + let second = registry.acquire(component.clone(), module_bytes).await; + assert_eq!( + controller.headroom_bytes(), + limit - module_bytes, + "a second worker of the same component must not reserve the module again" + ); + + drop(first); + assert_eq!( + controller.headroom_bytes(), + limit - module_bytes, + "the module stays reserved while any worker of the component is resident" + ); + + drop(second); + assert_eq!( 
+ controller.headroom_bytes(), + limit, + "the module reservation is released when the last worker unloads" + ); + } + + /// A `RunningWorker` stores its component charge as + /// `Box` and releases it by dropping that box when + /// the worker unloads. Dropping the box must still release the module + /// reservation with the gate, i.e. the concrete charge's release runs through + /// the trait object exactly as it would for a live worker. + #[test] + async fn dropping_boxed_charge_releases_the_reservation() { + let limit = 1000u64; + let module_bytes = 200u64; + let controller = Arc::new(AdmissionController::new( + Box::new(FixedProbe { limit }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + let registry = ComponentChargeRegistry::new(GateChargeSource { + admission: Some(controller.clone()), + }); + + let charge = registry.acquire(key(), module_bytes).await; + // Store it exactly as RunningWorker does. + let boxed: Box = Box::new(charge); + assert_eq!(controller.headroom_bytes(), limit - module_bytes); + + drop(boxed); + assert_eq!( + controller.headroom_bytes(), + limit, + "dropping the boxed charge (as on worker unload) must release the reservation" + ); + } +} diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 9a53176160..4ff9f0a00c 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -963,30 +963,15 @@ pub struct MemoryConfig { pub system_memory_override: Option, pub worker_memory_ratio: f64, pub worker_estimate_coefficient: f64, - /// Multiplier applied to a component's `component_size`, charged once per - /// resident component (shared across all its workers) rather than per worker. + /// Multiplier applied to a component's `component_size` when reserving its + /// compiled-module memory with the admission gate, charged once per resident + /// component (shared across all its workers) rather than per worker. 
pub component_size_coefficient: f64, - /// Multiplier (typically > 1.0) applied to the measured limit when sizing the - /// estimate semaphore. The estimate per worker is normally larger than its - /// real resident usage, so the semaphore is allowed to authorize more - /// estimated bytes than the limit: it is the second line of defence behind - /// the measured-headroom gate, catching the concurrent-admission race the - /// (lockless) gate cannot, while the gate refuses first in normal operation - /// against real usage. Always clamped by `worker_memory_max_safe_ratio` so it - /// can never itself authorise real usage past a safe fraction of the limit. - pub worker_memory_overcommit_ratio: f64, - /// Hard upper bound (fraction of the measured limit, < 1.0) on the estimate - /// semaphore size, regardless of `worker_memory_overcommit_ratio`. Keeps the - /// semaphore below the true limit so headroom always remains for the wasmtime - /// host even if the semaphore is the binding guard and estimates happen to - /// match real usage. - pub worker_memory_max_safe_ratio: f64, /// Whether the measured-headroom admission gate is active. Requires the /// executor to own its memory environment (its own cgroup/process), as in a /// production pod. Disable in shared environments — such as the in-process /// test harness — where the probe cannot isolate this executor's footprint - /// from co-resident processes; admission then relies on the estimate - /// semaphore alone. + /// from co-resident processes. pub enable_measured_admission: bool, #[serde(with = "humantime_serde")] pub acquire_retry_delay: Duration, @@ -1010,24 +995,6 @@ impl MemoryConfig { sysinfo.available_memory() } - /// Size of the estimate semaphore: the measured limit scaled by the - /// overcommit ratio, then clamped to `worker_memory_max_safe_ratio` of the - /// limit. 
The overcommit lets the semaphore sit slightly above the gate - /// ceiling as a second line of defence (per-worker estimates exceed real - /// usage, so it rarely binds first); the clamp guarantees it can never be - /// sized to authorise real usage past a safe fraction of the limit, leaving - /// headroom for the wasmtime host. - pub fn worker_memory_for_limit(&self, limit_bytes: u64) -> usize { - let limit = limit_bytes as f64; - let overcommit = limit * self.worker_memory_overcommit_ratio; - let safe_cap = limit * self.worker_memory_max_safe_ratio; - overcommit.min(safe_cap) as usize - } - - pub fn worker_memory(&self) -> usize { - self.worker_memory_for_limit(self.total_system_memory()) - } - /// The admission policy for the measured-headroom gate. Reuses /// `worker_memory_ratio` as the usable fraction of the measured limit (the /// host keeps the remainder). @@ -1059,16 +1026,6 @@ impl SafeDisplay for MemoryConfig { "component size coefficient: {}", self.component_size_coefficient ); - let _ = writeln!( - &mut result, - "worker memory overcommit ratio: {}", - self.worker_memory_overcommit_ratio - ); - let _ = writeln!( - &mut result, - "worker memory max safe ratio: {}", - self.worker_memory_max_safe_ratio - ); let _ = writeln!( &mut result, "measured admission enabled: {}", @@ -1599,8 +1556,6 @@ impl Default for MemoryConfig { worker_memory_ratio: 0.8, worker_estimate_coefficient: 1.1, component_size_coefficient: 2.0, - worker_memory_overcommit_ratio: 1.2, - worker_memory_max_safe_ratio: 0.9, enable_measured_admission: true, acquire_retry_delay: Duration::from_millis(500), oom_retry_config: RetryConfig { diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index efd692f7c4..bd1ead3243 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -28,7 +28,7 @@ use crate::metrics::storage::record_filesystem_pool_released; use crate::model::{AgentConfig, ExecutionStatus, 
LookupResult, ReadFileResult, TrapType}; use crate::services::active_workers::{ FilesystemStoragePermit, HeldComponentCharge, RegisteredConcurrentAccount, - WorkerComponentCharge, WorkerMemoryPermit, + WorkerComponentCharge, }; use crate::services::events::{Event, EventsSubscription}; use crate::services::golem_config::SnapshotPolicy; @@ -137,6 +137,11 @@ pub struct Worker { /// at least that many bytes from the blocking eviction path, ensuring /// enough idle workers are evicted to satisfy the pending write. desired_extra_filesystem_storage: AtomicU64, + /// Cumulative memory bytes this worker has reserved with the admission gate: + /// its initial requirement plus every grow delta. Released back to the gate + /// in full when the worker unloads, so the gate's granted total stays exactly + /// symmetric with what was reserved. + granted_memory: AtomicU64, } impl HasOplog for Worker { @@ -349,6 +354,7 @@ impl Worker { last_resume_request: Mutex::new(Timestamp::now_utc()), snapshot_recovery_disabled: AtomicBool::new(false), desired_extra_filesystem_storage: AtomicU64::new(0), + granted_memory: AtomicU64::new(0), }; // Wire the worker event service into the forwarding oplog so plugin errors @@ -985,12 +991,11 @@ impl Worker { // Should only be called from invocation loop pub async fn increase_memory(&self, delta: u64) -> anyhow::Result<()> { - // The instance lock must not be held while acquiring memory permits: - // permit acquisition runs the admission eviction scan, which takes other - // workers' instance locks. Holding this worker's instance lock across - // that scan while another growing worker does the same is an AB-BA - // deadlock. So acquire the permit without the lock, then re-lock only to - // merge it into the running worker. + // The instance lock must not be held while running the admission gate: + // it may run the eviction scan, which takes other workers' instance + // locks. 
Holding this worker's instance lock across that scan while + // another growing worker does the same is an AB-BA deadlock. So check the + // state, release the lock, then run the gate. match &*self.instance.lock().await { WorkerInstance::Running(_) => {} WorkerInstance::Stopping(_) @@ -999,23 +1004,22 @@ impl Worker { | WorkerInstance::Deleting => return Ok(()), } - let Some(new_permits) = self.active_workers().try_acquire(delta).await else { - return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)); - }; + if self.active_workers().try_acquire(delta).await { + self.granted_memory.fetch_add(delta, Ordering::Relaxed); + Ok(()) + } else { + Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)) + } + } - // Re-check state under the lock: the worker may have changed state while - // permits were being acquired. If it is no longer running, drop the - // permits (returned to the pool on drop) and treat as a no-op, matching - // the non-running arms above. - match &mut *self.instance.lock().await { - WorkerInstance::Running(running) => { - running.merge_extra_permits(new_permits); - Ok(()) - } - WorkerInstance::Stopping(_) - | WorkerInstance::WaitingForPermit(_) - | WorkerInstance::Unloaded { .. } - | WorkerInstance::Deleting => Ok(()), + /// Release this worker's entire accumulated memory grant back to the + /// admission gate, resetting the running total to zero. Called when the + /// worker stops being resident; a later reload re-accumulates the grant from + /// scratch through the acquire path. 
+ fn release_granted_memory(&self) { + let granted = self.granted_memory.swap(0, Ordering::Relaxed); + if granted > 0 { + self.active_workers().release_memory(granted); } } @@ -1667,14 +1671,15 @@ impl Worker { // when stopping via the invocation loop we can stop immediately, no need to go via the stopping status if called_from_invocation_loop { crate::metrics::workers::dec_worker_memory_resident(); + self.release_granted_memory(); **instance_guard = final_state.into_instance(); StopResult::Stopped } else { // drop the running worker, this signals to the invocation loop to start exiting. - // RunningWorker::drop releases the memory permit, so dec resident here. let run_loop_handle = running.stop(); let notify = OneShotEvent::new(); crate::metrics::workers::dec_worker_memory_resident(); + self.release_granted_memory(); **instance_guard = WorkerInstance::Stopping(StoppingWorker { notify: notify.clone(), final_state, @@ -2223,8 +2228,7 @@ impl Worker { async fn start_waiting_worker( this: Arc>, - permit: WorkerMemoryPermit, - component_charge: WorkerComponentCharge, + component_charge: WorkerComponentCharge, filesystem_storage_permit: Option, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, oom_retry_count: u32, @@ -2239,7 +2243,6 @@ impl Worker { this.owned_agent_id.clone(), this.queue.clone(), this.clone(), - permit, component_charge, concurrent_agent_permit, oom_retry_count, @@ -2254,6 +2257,9 @@ impl Worker { } _ => { debug!("worker was not waiting for permit anymore, not starting"); + // The grant was reserved before this call; the worker is not + // becoming resident, so release it rather than leak it. 
+ this.release_granted_memory(); } } } @@ -2391,13 +2397,17 @@ impl WaitingWorker { let agent_id = parent.owned_agent_id.agent_id(); let registered_concurrent_account = parent.registered_concurrent_account.clone(); let concurrent_agent_permit = registered_concurrent_account.acquire(agent_id).await; - // Do not reserve executor memory while waiting for a per-account - // concurrency slot. Otherwise one account could fill the memory - // pool with workers that are not allowed to run yet. - let permit = parent.active_workers().acquire(memory_requirement).await; - // Charge the component's compiled module size once per resident + // Do not gate executor memory while waiting for a per-account + // concurrency slot. Otherwise one account could exhaust the + // memory headroom with workers that are not allowed to run yet. + parent.active_workers().acquire(memory_requirement).await; + parent + .granted_memory + .fetch_add(memory_requirement, Ordering::Relaxed); + // Reserve the component's compiled module size once per resident // component (shared by all its workers). Held for as long as this - // worker is resident. + // worker is resident; the module faults into RAM when the first + // worker loads, so reserving it keeps later admissions honest. 
let component_charge = match parent.component_charge_requirement().await { Ok((component_id, component_revision, component_module_bytes)) => { parent @@ -2413,6 +2423,7 @@ impl WaitingWorker { warn!( "Failed to determine component charge requirement, not starting: {err}" ); + parent.release_granted_memory(); return; } }; @@ -2466,7 +2477,6 @@ impl WaitingWorker { debug!("Attempting to start worker after acquiring enough permits"); Worker::start_waiting_worker( parent, - permit, component_charge, filesystem_storage_permit, concurrent_agent_permit, @@ -2499,11 +2509,9 @@ struct RunningWorker { handle: Option>, sender: UnboundedSender, queue: Arc>>, - permit: WorkerMemoryPermit, - /// Keeps this worker's component module charge alive for as long as the - /// worker is resident. Held only to be dropped: dropping it releases the - /// component's residency, and the module charge if this was the last worker - /// of the component. + /// Keeps this worker's component module charge alive while it is resident. + /// Held only to be dropped: dropping it releases the component's residency + /// (and the module reservation if this was the last worker of the component). #[allow(dead_code)] component_charge: Box, /// Storage semaphore permits held by this worker. 
`None` until storage @@ -2536,8 +2544,7 @@ impl RunningWorker { owned_agent_id: OwnedAgentId, queue: Arc>>, parent: Arc>, - permit: WorkerMemoryPermit, - component_charge: WorkerComponentCharge, + component_charge: WorkerComponentCharge, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, oom_retry_count: u32, ) -> Self { @@ -2587,7 +2594,6 @@ impl RunningWorker { handle: Some(handle), sender, queue, - permit, component_charge: Box::new(component_charge), filesystem_storage_permit: None, waiting_for_command, @@ -2596,10 +2602,6 @@ impl RunningWorker { } } - pub fn merge_extra_permits(&mut self, extra_permit: WorkerMemoryPermit) { - self.permit.merge(extra_permit); - } - /// Merge additional storage permits into this worker's storage permit. If /// the worker does not yet hold a storage permit, the given permit becomes /// the initial one. Additional calls merge into that initial permit. From 8cecf91c4da8e1be6af64007897e98e2b062a75b Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 16:34:26 -0700 Subject: [PATCH 38/60] fix: clippy warnings --- golem-worker-executor/src/services/active_workers/tests.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs index 074576eb54..53481b4d18 100644 --- a/golem-worker-executor/src/services/active_workers/tests.rs +++ b/golem-worker-executor/src/services/active_workers/tests.rs @@ -781,14 +781,14 @@ mod component_module_charge { assert_eq!(controller.headroom_bytes(), limit); - let first = registry.acquire(component.clone(), module_bytes).await; + let first = registry.acquire(component, module_bytes).await; assert_eq!( controller.headroom_bytes(), limit - module_bytes, "first worker of a component must reserve the module size with the gate" ); - let second = registry.acquire(component.clone(), 
module_bytes).await; + let second = registry.acquire(component, module_bytes).await; assert_eq!( controller.headroom_bytes(), limit - module_bytes, From 8566f132bb02bce7a3833b48edd29324b983abd5 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 19:06:27 -0700 Subject: [PATCH 39/60] fix: startup message regarding memory --- golem-debugging-service/src/lib.rs | 13 +++++++++---- golem-worker-executor/src/lib.rs | 12 ++++++++---- .../src/services/golem_config.rs | 16 ---------------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/golem-debugging-service/src/lib.rs b/golem-debugging-service/src/lib.rs index 71e3aeac9c..2dea553b6b 100644 --- a/golem-debugging-service/src/lib.rs +++ b/golem-debugging-service/src/lib.rs @@ -375,14 +375,19 @@ pub async fn run_debug_worker_executor + ?Sized + Sen ) -> anyhow::Result { debug!("Initializing debug worker executor"); - let total_system_memory = golem_config.memory.total_system_memory(); - let system_memory = golem_config.memory.system_memory(); + let memory_snapshot = + golem_worker_executor::services::active_workers::memory_probe::default_probe( + golem_config.memory.system_memory_override, + ) + .snapshot(); + let total_system_memory = memory_snapshot.limit_bytes; + let used_system_memory = memory_snapshot.current_bytes; let worker_memory = (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64; info!( - "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}", + "Measured memory limit: {}, Currently used: {}, Usable for workers: {}", ISizeFormatter::new(total_system_memory, humansize::BINARY), - ISizeFormatter::new(system_memory, humansize::BINARY), + ISizeFormatter::new(used_system_memory, humansize::BINARY), ISizeFormatter::new(worker_memory, humansize::BINARY) ); diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs index 1eedc9f5e1..f2df280bff 100644 --- 
a/golem-worker-executor/src/lib.rs +++ b/golem-worker-executor/src/lib.rs @@ -1000,14 +1000,18 @@ pub async fn bootstrap_and_run_worker_executor< ) -> anyhow::Result { debug!("Initializing worker executor"); - let total_system_memory = golem_config.memory.total_system_memory(); - let system_memory = golem_config.memory.system_memory(); + let memory_snapshot = crate::services::active_workers::memory_probe::default_probe( + golem_config.memory.system_memory_override, + ) + .snapshot(); + let total_system_memory = memory_snapshot.limit_bytes; + let used_system_memory = memory_snapshot.current_bytes; let worker_memory = (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64; info!( - "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}", + "Measured memory limit: {}, Currently used: {}, Usable for workers: {}", ISizeFormatter::new(total_system_memory, BINARY), - ISizeFormatter::new(system_memory, BINARY), + ISizeFormatter::new(used_system_memory, BINARY), ISizeFormatter::new(worker_memory, BINARY) ); diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 4ff9f0a00c..a11a411f77 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -979,22 +979,6 @@ pub struct MemoryConfig { } impl MemoryConfig { - /// The memory limit this executor must stay under, resolved through the same - /// probe the admission gate uses: the cgroup `memory.max` of the pod on a - /// constrained Linux deployment, the configured override when set, and host - /// RAM only when the process is genuinely unconstrained. In a container this - /// is the pod's ceiling, not the host's total RAM. 
- pub fn total_system_memory(&self) -> u64 { - crate::services::active_workers::memory_probe::default_probe(self.system_memory_override) - .limit_bytes() - } - - pub fn system_memory(&self) -> u64 { - let mut sysinfo = sysinfo::System::new(); - sysinfo.refresh_memory(); - sysinfo.available_memory() - } - /// The admission policy for the measured-headroom gate. Reuses /// `worker_memory_ratio` as the usable fraction of the measured limit (the /// host keeps the remainder). From 626e4bac29b50894d67aea0bd6135cb194d3e91f Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:25:46 -0700 Subject: [PATCH 40/60] chore: run only oom test --- .../cloud-density-saturation.yaml | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml index 78b0064fa2..1d7a477661 100644 --- a/integration-tests/benchmark_suites/cloud-density-saturation.yaml +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -28,41 +28,41 @@ name: cloud-density-saturation benchmarks: - # Rust echo agents — lean per-instance linear memory (the ~900 KB module is - # charged once per component, shared across all agents; what scales per agent - # is the small instance heap). The previous run reached the top of the sweep - # (12000) without saturating pod memory, so the knee here is throughput / - # eviction-churn rather than memory. Dropped the low points that told us - # nothing and pushed the range up with coarser steps. 
- - name: throughput-saturation-echo-rust - iterations: 3 - clusterSize: [2] - size: [2000, 3000, 4000, 5000, 10000, 15000, 20000] - length: [0] - - # TypeScript echo agents — each instance instantiates its own QuickJS runtime - # and JS heap in its own linear memory (the 17.4 MB module is shared once per - # component; the per-instance runtime state is the heavy per-agent cost). - # Heavier per agent than the Rust variant, so a lower knee — but the previous - # run reached 2000 without saturating, so push higher and drop the low points. - - name: throughput-saturation-echo-ts - iterations: 3 - clusterSize: [2] - size: [1000, 2000, 3000] - length: [0] + # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is + # # charged once per component, shared across all agents; what scales per agent + # # is the small instance heap). The previous run reached the top of the sweep + # # (12000) without saturating pod memory, so the knee here is throughput / + # # eviction-churn rather than memory. Dropped the low points that told us + # # nothing and pushed the range up with coarser steps. + # - name: throughput-saturation-echo-rust + # iterations: 3 + # clusterSize: [2] + # size: [2000, 3000, 4000, 5000, 10000, 15000, 20000] + # length: [0] - # # Synthetic footprint — each agent retains a deterministic per-agent-distinct - # # amount of resident memory, exercising the admission/eviction path with a - # # controllable footprint near the limit. Run first: this is the variant that - # # actually fills memory and drives the gate to its reject/evict path. - # # size = number of active, memory-holding agents (the ramp axis) - # # length = base per-agent memory footprint in bytes; each agent retains a - # # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => - # # ~72 MiB average per agent, filling a ~10 GiB usable pool around - # # ~145 agents. 
The sweep brackets that ceiling and pushes well past it - # # so the admission gate's reject/evict behaviour near OOM is exercised. - # - name: throughput-saturation-counters + # # TypeScript echo agents — each instance instantiates its own QuickJS runtime + # # and JS heap in its own linear memory (the 17.4 MB module is shared once per + # # component; the per-instance runtime state is the heavy per-agent cost). + # # Heavier per agent than the Rust variant, so a lower knee — but the previous + # # run reached 2000 without saturating, so push higher and drop the low points. + # - name: throughput-saturation-echo-ts # iterations: 3 # clusterSize: [2] - # size: [50, 100, 150, 200, 300] - # length: [16777216] + # size: [1000, 2000, 3000] + # length: [0] + + # Synthetic footprint — each agent retains a deterministic per-agent-distinct + # amount of resident memory, exercising the admission/eviction path with a + # controllable footprint near the limit. Run first: this is the variant that + # actually fills memory and drives the gate to its reject/evict path. + # size = number of active, memory-holding agents (the ramp axis) + # length = base per-agent memory footprint in bytes; each agent retains a + # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => + # ~72 MiB average per agent, filling a ~10 GiB usable pool around + # ~145 agents. The sweep brackets that ceiling and pushes well past it + # so the admission gate's reject/evict behaviour near OOM is exercised. 
+ - name: throughput-saturation-counters + iterations: 1 + clusterSize: [2] + size: [50, 100, 150, 200, 300] + length: [16777216] From 24340479cc2edb573f22ee34dc675b9801b2bfae Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 21:05:46 -0700 Subject: [PATCH 41/60] feat: enable the whole perf test suite --- .../benchmark_suites/cloud-perf.yaml | 174 +++++++++--------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 21ef48352a..ea8ce74403 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -29,102 +29,102 @@ benchmarks: # size = number of workers per implementation (×6 implementations total) # length = unused for echo - name: throughput-echo - iterations: 3 + iterations: 1 clusterSize: [2] - size: [1, 10, 50, 100, 250] + size: [1, 50, 100, 250] length: [1000] - # # size = number of workers per implementation - # # length = payload size in bytes sent to large_input - # # NOTE: large payloads grow worker linear memory, so this is the throughput - # # benchmark most relevant to the memory-admission investigation — sized to - # # match throughput-echo so it exercises real density.
+ - name: throughput-large-input + iterations: 1 + clusterSize: [2] + size: [1, 50, 100, 250] + length: [100, 10000] - # # size = number of workers per implementation - # # length = CPU work length passed to cpu_intensive - # - name: throughput-cpu-intensive - # iterations: 3 - # clusterSize: [2] - # size: [1, 50, 100, 250] - # length: [100] + # size = number of workers per implementation + # length = CPU work length passed to cpu_intensive + - name: throughput-cpu-intensive + iterations: 1 + clusterSize: [2] + size: [1, 50, 100, 250] + length: [100] - # # Cold-start: compilation cache disabled — measures true cold-start latency - # # with no warm compiled artefact available. - # # size = number of unique components created (each in its own env) - # # length = seconds to wait per component for pre-compilation warm-up - # - name: cold-start-unknown-small - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 25, 50] - # length: [2] - # disableCompilationCache: true + # Cold-start: compilation cache disabled — measures true cold-start latency + # with no warm compiled artefact available. + # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + - name: cold-start-unknown-small + iterations: 1 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [2] + disableCompilationCache: true - # - name: cold-start-unknown-medium - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 25, 50] - # length: [5] - # disableCompilationCache: true + - name: cold-start-unknown-medium + iterations: 1 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [5] + disableCompilationCache: true - # # Cold-start: compilation cache enabled — measures latency once the compiled - # # artefact is available in the cache. 
- # # size = number of unique components created (each in its own env) - # # length = seconds to wait per component for pre-compilation warm-up - # # NOTE: if results here are close to the cache-disabled entries above, the - # # warm-up wait is too short and compilation hasn't finished — bump length. - # - name: cold-start-unknown-small - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 25, 50] - # length: [2] + # Cold-start: compilation cache enabled — measures latency once the compiled + # artefact is available in the cache. + # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + # NOTE: if results here are close to the cache-disabled entries above, the + # warm-up wait is too short and compilation hasn't finished — bump length. + - name: cold-start-unknown-small + iterations: 1 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [2] - # - name: cold-start-unknown-medium - # iterations: 3 - # clusterSize: [2] - # size: [1, 5, 10, 25, 50] - # length: [5] + - name: cold-start-unknown-medium + iterations: 1 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [5] - # # Invocation latency — hot and cold paths through the Gateway NLB. - # # Large worker counts to stress the load balancer and connection pool. - # # size = number of workers created - # # length = number of hot invocations per worker after the first cold one - # - name: latency-small - # iterations: 3 - # clusterSize: [2] - # size: [100, 500, 1000, 2000, 5000] - # length: [2] + # Invocation latency — hot and cold paths through the Gateway NLB. + # Large worker counts to stress the load balancer and connection pool. 
+ # size = number of workers created + # length = number of hot invocations per worker after the first cold one + - name: latency-small + iterations: 1 + clusterSize: [2] + size: [100, 500, 1000, 2000, 5000] + length: [2] - # - name: latency-medium - # iterations: 3 - # clusterSize: [2] - # size: [100, 500, 1000, 2000] - # length: [5] + - name: latency-medium + iterations: 1 + clusterSize: [2] + size: [100, 500, 1000, 2000] + length: [5] - # # Sleep — measures worker suspension and resumption under real network - # # conditions. High residency: all `size` workers held in memory sleeping at - # # once, so this also probes how many resident workers fit (memory-admission - # # relevant) — pushed past the ~2000 echo proved out. - # # size = number of workers launched in parallel - # # length = sleep duration in milliseconds - # - name: sleep - # iterations: 3 - # clusterSize: [2] - # size: [10, 100, 500, 1000, 2000] - # length: [10000] + # Sleep — measures worker suspension and resumption under real network + # conditions. High residency: all `size` workers held in memory sleeping at + # once, so this also probes how many resident workers fit (memory-admission + # relevant) — pushed past the ~2000 echo proved out. + # size = number of workers launched in parallel + # length = sleep duration in milliseconds + - name: sleep + iterations: 1 + clusterSize: [2] + size: [10, 100, 500, 1000, 2000] + length: [10000] - # # Durability overhead — measures the cost of durable vs ephemeral execution - # # across four variants (durable-persistent, durable-non-persistent, - # # ephemeral, durable-persistent-commit). size workers concurrent per phase; - # # sized up to put real load on the oplog/persistence/storage path. 
- # # size = number of workers per variant - # # length = loop iteration count passed to oplog_heavy - # - name: durability-overhead - # iterations: 3 - # clusterSize: [2] - # size: [10, 50, 100, 250] - # length: [5000] + # Durability overhead — measures the cost of durable vs ephemeral execution + # across four variants (durable-persistent, durable-non-persistent, + # ephemeral, durable-persistent-commit). size workers concurrent per phase; + # sized up to put real load on the oplog/persistence/storage path. + # size = number of workers per variant + # length = loop iteration count passed to oplog_heavy + - name: durability-overhead + iterations: 1 + clusterSize: [2] + size: [10, 50, 100, 250] + length: [5000] From a8fcf52168155fa2d3f33291ecbf86727a348087 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Wed, 10 Jun 2026 22:43:04 -0700 Subject: [PATCH 42/60] feat: more metrics plus FixedProbe for tests --- golem-worker-executor-test-utils/src/lib.rs | 44 ++++++++++++++++--- golem-worker-executor/src/lib.rs | 17 +++++-- golem-worker-executor/src/metrics.rs | 18 ++++++++ .../services/active_workers/memory_probe.rs | 32 ++++++++++++++ .../src/services/active_workers/mod.rs | 14 +++++- golem-worker-executor/src/worker/mod.rs | 1 + 6 files changed, 116 insertions(+), 10 deletions(-) diff --git a/golem-worker-executor-test-utils/src/lib.rs b/golem-worker-executor-test-utils/src/lib.rs index fcfb661670..6d86ffb7a7 100644 --- a/golem-worker-executor-test-utils/src/lib.rs +++ b/golem-worker-executor-test-utils/src/lib.rs @@ -82,6 +82,7 @@ use golem_worker_executor::preview2::golem::agent::host::{ }; use golem_worker_executor::preview2::{golem_api_1_x, golem_durability}; use golem_worker_executor::services::active_workers::ActiveWorkers; +use golem_worker_executor::services::active_workers::memory_probe::FixedProbe; use golem_worker_executor::services::agent_types::AgentTypesService; use 
golem_worker_executor::services::agent_webhooks::AgentWebhooksService; use golem_worker_executor::services::blob_store::{ @@ -536,8 +537,9 @@ fn make_base_test_config(deps: &WorkerExecutorTestDependencies) -> GolemConfig { // The measured-headroom admission gate requires the executor to own its // memory environment (cgroup/process). The in-process test harness runs // the executor alongside the test framework and other services, so the - // probe cannot isolate this executor's footprint — disable it and gate on - // the estimate semaphore alone, matching pre-gate behaviour. + // probe cannot isolate this executor's footprint. Disable the gate so + // admission always proceeds and tests are not subject to a memory limit + // derived from the shared host. memory: MemoryConfig { enable_measured_admission: false, ..Default::default() @@ -705,9 +707,16 @@ pub async fn start_customized( apply_sqlite_storage_config(&mut config, deps, context); config.memory = MemoryConfig { system_memory_override, - // Measured admission disabled in the shared in-process test harness; the - // small system_memory_override here drives the estimate semaphore alone. - enable_measured_admission: false, + // Enable the measured-headroom gate when a test pins a memory limit, so + // memory-pressure tests exercise the real admission controller under that + // limit. The test bootstrap (create_active_workers) feeds the gate a + // fixed probe reporting this limit with zero current usage, so admission + // is decided on the granted accounting against the pinned limit and is + // not perturbed by the shared test process's RSS. Otherwise the gate is + // disabled (see make_base_test_config). The usable ratio + // (worker_memory_ratio, default 0.8) applies, matching the pre-gate + // semaphore pool size of system_memory_override * ratio. 
+ enable_measured_admission: system_memory_override.is_some(), ..Default::default() }; config.filesystem_storage = FilesystemStorageConfig { @@ -1370,6 +1379,31 @@ impl InvocationContextManagement for TestWorkerCtx { #[async_trait] impl Bootstrap for TestServerBootstrap { + fn create_active_workers( + &self, + golem_config: &GolemConfig, + ) -> Arc> { + // The in-process test harness shares its process (and RSS) with the test + // framework and other services, so a process-RSS probe cannot isolate + // this executor's footprint. When a test pins a memory limit via + // system_memory_override, give the gate a fixed probe reporting that + // limit with zero current usage, so admission is decided solely on the + // granted accounting (exact and process-isolated) against the pinned + // limit. The usable_ratio (worker_memory_ratio) still applies, matching + // the pre-gate semaphore pool size of system_memory_override * ratio. + match golem_config.memory.system_memory_override { + Some(limit) => Arc::new(ActiveWorkers::new_with_probe( + Box::new(FixedProbe::new(limit, 0)), + &golem_config.memory, + &golem_config.filesystem_storage, + )), + None => Arc::new(ActiveWorkers::new( + &golem_config.memory, + &golem_config.filesystem_storage, + )), + } + } + fn create_shard_manager_service( &self, _shard_manager_client: Arc, diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs index f2df280bff..8b05e6e71c 100644 --- a/golem-worker-executor/src/lib.rs +++ b/golem-worker-executor/src/lib.rs @@ -161,6 +161,18 @@ impl Drop for RunDetails { #[async_trait] #[allow(clippy::too_many_arguments)] pub trait Bootstrap { + /// Creates the [`ActiveWorkers`] service, including the measured-headroom + /// admission gate. The default builds the memory probe from the config + /// (cgroup/process/override). 
The in-process test harness overrides this to + /// inject a probe with a pinned limit and usage so the gate is deterministic + /// and isolated from the shared test process's RSS. + fn create_active_workers(&self, golem_config: &GolemConfig) -> Arc> { + Arc::new(ActiveWorkers::::new( + &golem_config.memory, + &golem_config.filesystem_storage, + )) + } + fn create_shard_manager_service( &self, shard_manager_client: Arc, @@ -769,10 +781,7 @@ pub async fn create_worker_executor_impl< } }; - let active_workers = Arc::new(ActiveWorkers::::new( - &golem_config.memory, - &golem_config.filesystem_storage, - )); + let active_workers = bootstrap.create_active_workers(&golem_config); let file_loader = Arc::new(FileLoader::new( initial_files_service.clone(), diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index b611f9985b..0e009c0705 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -208,6 +208,21 @@ pub mod workers { &["executor_id"] ) .unwrap(); + pub static ref WORKER_MEMORY_GROW_REJECTED_TOTAL: CounterVec = register_counter_vec!( + "golem_worker_memory_grow_rejected_total", + "Invocations interrupted because a worker's linear-memory grow could not be admitted by the gate (out-of-memory trap, retried via reacquire)", + &["executor_id"] + ) + .unwrap(); + } + + /// Counts one invocation interrupted because a linear-memory grow was + /// refused by the admission gate (the worker traps out-of-memory and is + /// restarted to reacquire memory). + pub fn record_worker_memory_grow_rejected() { + WORKER_MEMORY_GROW_REJECTED_TOTAL + .with_label_values(&[crate::metrics::storage::executor_id()]) + .inc(); } /// Sets the gate's usable memory ceiling gauge. 
@@ -269,6 +284,9 @@ pub mod workers { WORKER_WAITING_FOR_MEMORY_COUNT .with_label_values(&[id]) .set(0.0); + WORKER_MEMORY_GROW_REJECTED_TOTAL + .with_label_values(&[id]) + .inc_by(0.0); } pub fn inc_worker_memory_resident() { diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs index 6a26b3dd25..6940b53db4 100644 --- a/golem-worker-executor/src/services/active_workers/memory_probe.rs +++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs @@ -100,6 +100,38 @@ impl MemoryProbe for ProcessRssProbe { } } +/// A probe with a fixed limit and a fixed current usage, both set at +/// construction. Reports the same snapshot on every call regardless of the +/// host. Used by the in-process test harness, where the executor shares its +/// process (and therefore its real RSS) with the test framework and other +/// services, so a process-RSS probe cannot isolate this executor's footprint. +/// Pinning `current_bytes` to a known value (typically 0) makes the gate decide +/// purely on the granted accounting against the pinned limit, which is exact and +/// process-isolated, so memory-pressure tests are deterministic. +#[derive(Debug)] +pub struct FixedProbe { + limit_bytes: u64, + current_bytes: u64, +} + +impl FixedProbe { + pub fn new(limit_bytes: u64, current_bytes: u64) -> Self { + Self { + limit_bytes, + current_bytes, + } + } +} + +impl MemoryProbe for FixedProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit_bytes, + current_bytes: self.current_bytes, + } + } +} + /// Linux cgroup v2 probe. Reads `memory.max` and `memory.current` from the /// process's cgroup. 
#[cfg(target_os = "linux")] diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index b428674a07..4ac70f8744 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -33,7 +33,7 @@ use admission::{AdmissionController, AdmissionDecision, EvictionPriority, Evicti use async_trait::async_trait; pub use component_charge::HeldComponentCharge; use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry}; -use memory_probe::default_probe; +use memory_probe::{MemoryProbe, default_probe}; use std::sync::Arc; use std::time::Duration; @@ -102,6 +102,18 @@ impl ActiveWorkers { // bases its decision on the pod's cgroup limit when constrained (not host // RAM). let probe = default_probe(memory_config.system_memory_override); + Self::new_with_probe(probe, memory_config, storage_config) + } + + /// Like [`Self::new`] but with an explicitly provided memory probe instead of + /// the one derived from the config. The in-process test harness uses this to + /// supply a probe with a pinned limit and current usage, so the gate's + /// decision is deterministic and isolated from the shared test process's RSS. 
+ pub fn new_with_probe( + probe: Box, + memory_config: &MemoryConfig, + storage_config: &FilesystemStorageConfig, + ) -> Self { let admission = memory_config.enable_measured_admission.then(|| { Arc::new(AdmissionController::new( probe, diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index bd1ead3243..69dd1c769f 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -1008,6 +1008,7 @@ impl Worker { self.granted_memory.fetch_add(delta, Ordering::Relaxed); Ok(()) } else { + crate::metrics::workers::record_worker_memory_grow_rejected(); Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)) } } From 7eb6f08fcaf80d02a1d1134d2c81b908d58a5f66 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Thu, 11 Jun 2026 01:12:43 -0700 Subject: [PATCH 43/60] fix: make admission gate reserve atomic to prevent ceiling overshoot --- .../services/active_workers/admission/mod.rs | 37 ++++++-- .../active_workers/admission/tests.rs | 85 +++++++++++++++++++ 2 files changed, 115 insertions(+), 7 deletions(-) diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs index 0008f66773..e9ca7f7079 100644 --- a/golem-worker-executor/src/services/active_workers/admission/mod.rs +++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs @@ -134,14 +134,38 @@ impl AdmissionController { /// not cover), so taking the maximum keeps the gate safe against both the /// grant a worker may yet fault in and any usage the grant does not capture. fn admissible_headroom(&self) -> u64 { + let granted = *self.granted.lock().unwrap(); + self.headroom_with_granted(granted) + } + + /// Computes admissible headroom for an already-read `granted` value. Reads + /// the probe and emits the ceiling/RSS metrics. 
Kept separate from the lock + /// acquisition so the decision-and-reserve sequence can hold the lock across + /// both steps (see [`Self::try_reserve_locked`]). + fn headroom_with_granted(&self, granted: u64) -> u64 { let snapshot = self.probe.snapshot(); let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64; - let granted = *self.granted.lock().unwrap(); crate::metrics::workers::record_worker_memory_ceiling(ceiling); crate::metrics::workers::record_worker_admission_rss(snapshot.current_bytes); ceiling.saturating_sub(snapshot.current_bytes.max(granted)) } + /// Atomically admits `request_bytes` if the headroom computed against the + /// current granted total covers it: reads `granted`, computes headroom, and + /// adds the reservation all under one lock so two concurrent admissions + /// cannot both pass the check against the same headroom and overshoot the + /// ceiling. Returns whether the request was admitted. + fn try_reserve_locked(&self, request_bytes: u64) -> bool { + let mut granted = self.granted.lock().unwrap(); + if self.headroom_with_granted(*granted) >= request_bytes { + *granted += request_bytes; + crate::metrics::workers::record_worker_memory_granted(*granted); + true + } else { + false + } + } + /// Record `request_bytes` of memory granted to a newly admitted worker. The /// gate reserves this until the worker unloads, because the worker may fault /// the granted pages in at any later time. @@ -192,9 +216,8 @@ impl AdmissionController { request_bytes: u64, source: &dyn EvictionSource, ) -> AdmissionDecision { - // Fast path: enough real headroom already, admit without evicting. - if self.admissible_headroom() >= request_bytes { - self.reserve(request_bytes); + // Fast path: atomically admit if there is already enough real headroom. 
+ if self.try_reserve_locked(request_bytes) { return AdmissionDecision::Admit; } @@ -212,9 +235,9 @@ impl AdmissionController { // Re-measure against ground truth rather than trusting the freed tally: // the probe is the authority, and other activity may have moved usage - // in either direction while we were evicting. - if self.admissible_headroom() >= request_bytes { - self.reserve(request_bytes); + // in either direction while we were evicting. The check-and-reserve is + // atomic so a concurrent admission cannot slip in between. + if self.try_reserve_locked(request_bytes) { AdmissionDecision::Admit } else { AdmissionDecision::Reject diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs index 24e9b3e119..6f263930b3 100644 --- a/golem-worker-executor/src/services/active_workers/admission/tests.rs +++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs @@ -254,6 +254,91 @@ async fn apply_staggered_admit( decision } +/// A probe with a fixed limit that always reports zero current usage, so the +/// gate's admission decision is driven solely by the granted accounting against +/// the ceiling. Used by the concurrency test, where the property under test is +/// that the granted counter cannot be over-committed by racing admissions. +#[derive(Debug)] +struct ZeroUsageProbe { + limit: u64, +} + +impl MemoryProbe for ZeroUsageProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit, + current_bytes: 0, + } + } +} + +/// An eviction source with nothing to evict: a rejected request stays rejected. +struct NoEvictionSource; + +#[async_trait::async_trait] +impl EvictionSource for NoEvictionSource { + async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 { + 0 + } +} + +/// Concurrent admissions must never grant more than the ceiling allows. 
+/// +/// Many admit attempts of equal size race against a controller whose ceiling +/// admits only a known number of them, with no evictable work to fall back on. +/// Exactly `ceiling / request` requests must be admitted and the rest rejected; +/// the total granted must never exceed the ceiling. This can only hold if each +/// admission's "is there room? then reserve" sequence is atomic against the +/// others — if two admits read the same headroom before either reserves, both +/// pass and the granted total overshoots the ceiling. +#[test] +fn concurrent_admissions_never_overcommit_the_ceiling() { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(8) + .build() + .unwrap(); + + rt.block_on(async { + const REQUEST: u64 = 10; + const CAPACITY: u64 = 50; // exactly 5 requests fit + const ATTEMPTS: usize = 200; // far more than fit, all racing + + let controller = Arc::new(AdmissionController::new( + Box::new(ZeroUsageProbe { limit: CAPACITY }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + + let mut handles = Vec::with_capacity(ATTEMPTS); + for _ in 0..ATTEMPTS { + let controller = controller.clone(); + handles.push(tokio::spawn(async move { + controller.try_admit(REQUEST, &NoEvictionSource).await + })); + } + + let mut admitted = 0usize; + for handle in handles { + if handle.await.unwrap() == AdmissionDecision::Admit { + admitted += 1; + } + } + + let expected = (CAPACITY / REQUEST) as usize; + assert_eq!( + admitted, expected, + "expected exactly {expected} admissions to fit, got {admitted}" + ); + // With zero measured usage, headroom is the ceiling minus granted; if it + // equals the full ceiling again, everything admitted was released, which + // never happens here. The decisive check: the admitted total fits. 
+ assert!( + admitted as u64 * REQUEST <= CAPACITY, + "granted {} exceeded ceiling {CAPACITY}", + admitted as u64 * REQUEST + ); + }); +} + // ── Single-case unit tests ─────────────────────────────────────────────────── #[test] From 83a6b2f071cfc518935b8e890a8b49f951e1ced6 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Thu, 11 Jun 2026 02:04:43 -0700 Subject: [PATCH 44/60] test: gate concurrent-agent permit tests with a semaphore, not Notify --- .../tests/resource_limits.rs | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/golem-worker-executor/tests/resource_limits.rs b/golem-worker-executor/tests/resource_limits.rs index 58377cba3b..a816beb39e 100644 --- a/golem-worker-executor/tests/resource_limits.rs +++ b/golem-worker-executor/tests/resource_limits.rs @@ -186,11 +186,14 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish( let context = TestContext::new(last_unique_id); let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?; - // HTTP server that gates its /poll response behind a Notify. + // HTTP server that gates its /poll response behind a zero-permit semaphore. // HttpClient2.start_polling polls GET /poll until the body equals "done". - // By holding the Notify unreleased we keep a1 in the Running state - // for as long as needed, preventing eviction and holding the only permit. - let gate = std::sync::Arc::new(tokio::sync::Notify::new()); + // The handler blocks acquiring a permit, so by withholding the permit we keep + // a1 in the Running state for as long as needed, preventing eviction and + // holding the only permit. A semaphore is used rather than a Notify so the + // release is not sensitive to whether the request's waiter is registered + // before the release call. 
+ let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0)); let gate_clone = gate.clone(); let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?; let port = listener.local_addr()?.port(); @@ -200,7 +203,10 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish( get(move || { let gate = gate_clone.clone(); async move { - gate.notified().await; + gate.acquire() + .await + .expect("gate semaphore closed") + .forget(); "done".to_string() } }), @@ -259,7 +265,7 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish( // Release the gate — a1's poll loop returns "done", its invocation // completes, and its permit is returned to the semaphore via Drop. // This unblocks a2 from WaitingForPermit. - gate.notify_waiters(); + gate.add_permits(1); // Wait for a1 to become Idle (invocation done, permit released). executor @@ -320,7 +326,13 @@ async fn concurrent_agent_idle_releases_permit( let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?; // --- HTTP gate: keeps a1 provably Running until we release it. --- - let gate = std::sync::Arc::new(tokio::sync::Notify::new()); + // A zero-permit semaphore is used rather than a Notify so the release is not + // sensitive to whether the request's waiter is registered before the release + // call: a permit added before the handler reaches `acquire` is simply waiting + // for it. The handler blocks on `acquire` and only returns once the test adds + // a permit, so a1 stays Running (blocked in /poll) until then regardless of + // how the runner schedules the tasks. 
+ let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0)); let gate_clone = gate.clone(); let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?; let port = listener.local_addr()?.port(); @@ -330,7 +342,12 @@ async fn concurrent_agent_idle_releases_permit( get(move || { let gate = gate_clone.clone(); async move { - gate.notified().await; + // Consume one permit permanently so a single added permit + // releases exactly one poll, not a recycled one. + gate.acquire() + .await + .expect("gate semaphore closed") + .forget(); "done".to_string() } }), @@ -387,7 +404,7 @@ async fn concurrent_agent_idle_releases_permit( // Release the gate. a1's poll returns "done", invocation completes, a1 goes Idle. // With the fix: Idle transition drops the permit → semaphore notifies a2 → a2 starts. // With the bug: a1 stays Idle but holds permit → a2 remains blocked forever. - gate.notify_waiters(); + gate.add_permits(1); // a2 should now be unblocked (fix) or remain stuck (bug). // Give it 15 seconds — well beyond what starting a counter agent takes. 
From 24673f6f6975af7abe4dfc9d3970122f19e57e59 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Thu, 11 Jun 2026 15:53:07 -0700 Subject: [PATCH 45/60] feat: expose tokio metrics --- golem-worker-executor/src/lib.rs | 4 ++ golem-worker-executor/src/metrics.rs | 93 ++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs index 8b05e6e71c..377123ddf7 100644 --- a/golem-worker-executor/src/lib.rs +++ b/golem-worker-executor/src/lib.rs @@ -1061,6 +1061,10 @@ pub async fn bootstrap_and_run_worker_executor< let leak_detector = worker_executor_impl.leak_detector(); + join_set.spawn(crate::metrics::runtime::run_runtime_metrics_loop( + runtime.clone(), + )); + let grpc_port = run_grpc_server(worker_executor_impl, lazy_worker_activator, join_set).await?; let http_port = golem_service_base::observability::start_health_and_metrics_server( diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index 0e009c0705..237c4cb0f4 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -113,6 +113,99 @@ pub mod component { } } +pub mod runtime { + use std::time::{Duration, Instant}; + + use lazy_static::lazy_static; + use prometheus::*; + use tokio::runtime::Handle; + + lazy_static! { + /// Number of tasks currently sitting in the tokio runtime's global + /// (injection) queue: runnable but not yet polled by any worker thread. + /// A persistently non-zero value means ready tasks (including I/O + /// continuations such as DB-response handling) are waiting for a worker + /// thread, which inflates I/O latency metrics even when the underlying + /// I/O is fast. 
+ static ref GLOBAL_QUEUE_DEPTH: IntGauge = register_int_gauge!( + "executor_tokio_global_queue_depth", + "Tasks scheduled in the tokio runtime global queue, runnable but not yet polled" + ) + .unwrap(); + + /// Number of worker threads in the multi-thread runtime. + static ref NUM_WORKERS: IntGauge = register_int_gauge!( + "executor_tokio_num_workers", + "Number of tokio runtime worker threads" + ) + .unwrap(); + + /// Current number of alive tasks in the runtime. + static ref NUM_ALIVE_TASKS: IntGauge = register_int_gauge!( + "executor_tokio_num_alive_tasks", + "Number of alive tasks in the tokio runtime" + ) + .unwrap(); + + /// Per-worker busy ratio over the last sampling interval: the fraction + /// of wall-clock time the worker spent executing tasks. A value near 1.0 + /// means the worker is CPU-saturated and cannot promptly poll newly + /// ready tasks. + static ref WORKER_BUSY_RATIO: GaugeVec = register_gauge_vec!( + "executor_tokio_worker_busy_ratio", + "Fraction of wall-clock time each tokio worker spent busy over the sampling interval", + &["worker"] + ) + .unwrap(); + } + + /// Background loop that samples stable tokio runtime metrics and exports them + /// to Prometheus. + /// + /// All metrics used here are stable as of tokio 1.45 (the workspace resolves + /// 1.50+), so this requires neither the `tokio_unstable` cfg nor any build + /// flag. `global_queue_depth` is the primary diagnostic for runtime + /// scheduling pressure; `worker_busy_ratio` corroborates it by showing + /// per-worker CPU saturation. Never returns. + pub async fn run_runtime_metrics_loop(handle: Handle) -> anyhow::Result<()> { + const INTERVAL: Duration = Duration::from_secs(5); + + let metrics = handle.metrics(); + let num_workers = metrics.num_workers(); + NUM_WORKERS.set(num_workers as i64); + + // Previous cumulative busy duration per worker, for computing the busy + // ratio over each interval. 
+ let mut prev_busy: Vec = (0..num_workers) + .map(|w| metrics.worker_total_busy_duration(w)) + .collect(); + let mut prev_instant = Instant::now(); + + let mut interval = tokio::time::interval(INTERVAL); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + interval.tick().await; + + GLOBAL_QUEUE_DEPTH.set(metrics.global_queue_depth() as i64); + NUM_ALIVE_TASKS.set(metrics.num_alive_tasks() as i64); + + let now = Instant::now(); + let elapsed = now.duration_since(prev_instant).as_secs_f64(); + prev_instant = now; + if elapsed > 0.0 { + for (w, prev) in prev_busy.iter_mut().enumerate() { + let busy = metrics.worker_total_busy_duration(w); + let delta = busy.saturating_sub(*prev).as_secs_f64(); + *prev = busy; + WORKER_BUSY_RATIO + .with_label_values(&[&w.to_string()]) + .set((delta / elapsed).min(1.0)); + } + } + } + } +} + pub mod events { use lazy_static::lazy_static; use prometheus::*; From a1928c56311def1098c72101c54706999b2894c8 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:40:24 -0700 Subject: [PATCH 46/60] fix: prevent concurrent-agent scheduler deadlock on cancel-after-grant --- .../services/active_workers/tests.txt | 7 + .../concurrent_agents_scheduler.rs | 148 ++++++++---- .../src/services/active_workers/tests.rs | 220 ++++++++++++++++++ 3 files changed, 330 insertions(+), 45 deletions(-) create mode 100644 golem-worker-executor/proptest-regressions/services/active_workers/tests.txt diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt new file mode 100644 index 0000000000..5845bf0e72 --- /dev/null +++ b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. 
It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 25407766c98e9d718173e44b5321f97049eea6d6d7737aad80a937d7230d67d9 # shrinks to limit = 1, ops = [Acquire, Acquire, CancelPending(Index(423873604949)), Acquire, ReleaseThenCancel(Index(2899867607303593255), Index(13233034632676646474))] diff --git a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs index 77c3f74b86..2391fce697 100644 --- a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs +++ b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs @@ -48,42 +48,95 @@ struct AccountSchedulerState { struct QueuedAgent { agent_id: AgentId, - waker: tokio::sync::oneshot::Sender, + waker: tokio::sync::oneshot::Sender, } -/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`]. +/// A slot granted from the scheduler: owns the underlying semaphore permit and +/// the responsibility to decrement the account's `running_count` and wake the +/// next queued agent when it is released. /// -/// On drop, decrements the account's running count and wakes the next queued -/// agent (if any). The drop handler is fully synchronous. -pub struct ConcurrentAgentPermit { +/// Crucially, the `running_count` was incremented *together with* acquiring the +/// raw permit, and the matching decrement lives **only** here in `Drop`. 
This +/// binds the count strictly to the lifetime of the granted permit, regardless +/// of how the slot is ultimately disposed of: +/// +/// * It is moved into a [`ConcurrentAgentPermit`] and dropped when the agent +/// releases the slot (the normal case), or +/// * it is sent into a queued waiter's oneshot and that waiter is cancelled +/// before receiving it — the slot is then dropped *inside* the channel. +/// +/// Both paths run this same `Drop`, so a slot granted to a waiter that is +/// cancelled after the grant succeeded cannot leak the count. (A previous +/// design decremented only when the oneshot `send` failed, which left +/// `running_count` permanently inflated when a waiter was cancelled *after* a +/// successful send — wedging the whole account once the count reached the +/// limit.) +struct GrantedSlot { raw: Option, - account: Option>, + account: Arc, account_id: AccountId, } -impl Drop for ConcurrentAgentPermit { +impl Drop for GrantedSlot { fn drop(&mut self) { if let Some(raw) = self.raw.take() { // Return the raw permit to the semaphore first so it is available // for the next queued agent's synchronous try-acquire. drop(raw); - - if let Some(ref account) = self.account { - try_grant_next_sync(account, &self.account_id); - } + try_grant_next_sync(&self.account, &self.account_id); } } } -impl ConcurrentAgentPermit { - /// Consumes the permit without triggering the drop notification. - #[allow(dead_code)] - pub fn into_inner(mut self) -> Option { - self.account = None; +impl GrantedSlot { + /// Take the raw permit out, suppressing this slot's `Drop` bookkeeping. + /// + /// Used only from `drain_ready_queue` when a `send` to a cancelled waiter + /// fails: the slot is returned in the `Err`, but we are still holding the + /// account state lock, so letting its `Drop` run would re-enter + /// `try_grant_next_sync` and deadlock on the same non-reentrant mutex. The + /// caller takes the permit back and performs the accounting inline instead. 
+ fn defuse(mut self) -> Option { self.raw.take() } } +/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`]. +/// +/// On drop, decrements the account's running count and wakes the next queued +/// agent (if any) via the held [`GrantedSlot`]. Unlimited accounts hold a bare +/// permit with no slot, so dropping them touches no scheduler accounting. The +/// drop handler is fully synchronous. +pub struct ConcurrentAgentPermit { + /// `Some` for limited accounts (carries the scheduler accounting); `None` + /// for unlimited accounts, where `_raw` holds the bare bypass permit. Held + /// purely for its `Drop`, which returns the permit and wakes the next + /// queued agent. + _slot: Option, + /// Bare permit for the unlimited-account bypass path. Unused for limited + /// accounts (the permit lives inside `_slot`). + _raw: Option, +} + +impl ConcurrentAgentPermit { + /// A permit for a limited account, carrying the scheduler accounting. + fn from_slot(slot: GrantedSlot) -> Self { + Self { + _slot: Some(slot), + _raw: None, + } + } + + /// A permit for an unlimited account: a bare bypass permit with no + /// scheduler accounting. + fn unlimited(raw: OwnedSemaphorePermit) -> Self { + Self { + _slot: None, + _raw: Some(raw), + } + } +} + impl Default for ConcurrentAgentsScheduler { fn default() -> Self { Self::new() @@ -156,11 +209,7 @@ impl ConcurrentAgentsScheduler { // Unlimited accounts bypass the queue entirely. 
if is_unlimited(limit) { let raw = self.permits.acquire(account_id, || async { false }).await; - return ConcurrentAgentPermit { - raw: Some(raw), - account: None, - account_id, - }; + return ConcurrentAgentPermit::unlimited(raw); } // Sync the underlying semaphore pool size with the current plan limit @@ -175,16 +224,12 @@ impl ConcurrentAgentsScheduler { let limit = account.resource_entry.max_concurrent_agents_per_executor(); if is_unlimited(limit) { let raw = self.permits.acquire(account_id, || async { false }).await; - return ConcurrentAgentPermit { - raw: Some(raw), - account: None, - account_id, - }; + return ConcurrentAgentPermit::unlimited(raw); } enum AcquireDecision { FastPath(OwnedSemaphorePermit), - Queued(tokio::sync::oneshot::Receiver), + Queued(tokio::sync::oneshot::Receiver), } let decision = { @@ -197,7 +242,7 @@ impl ConcurrentAgentsScheduler { // After a plan upgrade, newly added semaphore permits may allow // queued agents to proceed. Drain what we can before deciding // about the current agent. - drain_ready_queue(&mut state, &account.raw_semaphore, limit, &account_id); + drain_ready_queue(&mut state, &account, limit, &account_id); // Fast path: capacity available, no older waiters, and the raw // semaphore actually has a permit. 
We try-acquire the semaphore @@ -239,26 +284,22 @@ impl ConcurrentAgentsScheduler { "ConcurrentAgentsScheduler: fast-path permit for {agent_id} in account {account_id}" ); - ConcurrentAgentPermit { + ConcurrentAgentPermit::from_slot(GrantedSlot { raw: Some(raw), - account: Some(account), + account, account_id, - } + }) } AcquireDecision::Queued(rx) => { debug!( "ConcurrentAgentsScheduler: {agent_id} queued in account {account_id}, waiting for permit" ); - let raw = rx.await.expect( + let slot = rx.await.expect( "ConcurrentAgentsScheduler: oneshot sender dropped without sending — scheduler bug", ); - ConcurrentAgentPermit { - raw: Some(raw), - account: Some(account), - account_id, - } + ConcurrentAgentPermit::from_slot(slot) } } } @@ -299,7 +340,7 @@ impl ConcurrentAgentsScheduler { /// be fully synchronous. Uses `tokio::sync::Semaphore::try_acquire_owned` /// (which is synchronous despite being on a tokio type) to acquire permits /// for queued agents. -fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) { +fn try_grant_next_sync(account: &Arc, account_id: &AccountId) { let limit = account.resource_entry.max_concurrent_agents_per_executor(); if is_unlimited(limit) { return; @@ -308,7 +349,7 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) { let mut state = account.state.lock().unwrap(); state.running_count = state.running_count.saturating_sub(1); - drain_ready_queue(&mut state, &account.raw_semaphore, limit, account_id); + drain_ready_queue(&mut state, account, limit, account_id); } /// Try to grant permits to queued agents from the front of the ready queue. @@ -316,9 +357,15 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) { /// Called both from `try_grant_next_sync` (Drop path) and from `acquire` /// (after a plan-upgrade sync adds new permits). Fully synchronous — only /// uses `try_acquire_owned` which does not block. 
+/// +/// Each granted permit is wrapped in a [`GrantedSlot`] carrying the +/// `running_count` decrement, so a waiter cancelled *after* a successful send +/// still releases its slot (via the slot's `Drop` when the oneshot channel is +/// dropped) rather than leaking the count. The increment here is matched +/// one-for-one by that slot's `Drop`. fn drain_ready_queue( state: &mut AccountSchedulerState, - raw_semaphore: &Arc, + account: &Arc, limit: u64, account_id: &AccountId, ) { @@ -326,13 +373,24 @@ fn drain_ready_queue( let queued = state.ready_queue.pop_front().unwrap(); // tokio::sync::Semaphore::try_acquire_owned is synchronous. - match raw_semaphore.clone().try_acquire_owned() { + match account.raw_semaphore.clone().try_acquire_owned() { Ok(raw) => { state.running_count += 1; - if queued.waker.send(raw).is_err() { - // Waiter was cancelled; the permit inside the oneshot - // is dropped, returning it to the semaphore. Decrement - // and try next. + let slot = GrantedSlot { + raw: Some(raw), + account: account.clone(), + account_id: *account_id, + }; + if let Err(slot) = queued.waker.send(slot) { + // Waiter was cancelled before we could hand it the slot. + // We are still holding the state lock, so we must not let + // the returned slot's `Drop` run (it would re-enter this + // path via `try_grant_next_sync` and deadlock). Defuse it, + // return its permit to the semaphore, and account for it + // inline, then try the next queued agent. 
+ if let Some(raw) = slot.defuse() { + drop(raw); + } state.running_count -= 1; debug!( "ConcurrentAgentsScheduler: waiter {} cancelled in account {account_id}, trying next", diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs index 53481b4d18..1f6c8313cf 100644 --- a/golem-worker-executor/src/services/active_workers/tests.rs +++ b/golem-worker-executor/src/services/active_workers/tests.rs @@ -840,3 +840,223 @@ mod component_module_charge { ); } } + +// ── ConcurrentAgentsScheduler — model-based liveness property ──────────────── +// +// The scheduler keeps its own `running_count` integer alongside the real tokio +// semaphore permits. The two must stay in lockstep: every increment of +// `running_count` must be matched by exactly one decrement, regardless of how a +// granted slot is disposed of (released by a live worker, or dropped inside a +// cancelled waiter's oneshot channel). If they drift, the scheduler wedges — +// `running_count` sticks at the limit while permits are actually free, and +// every future acquire queues forever. This is the production deadlock the +// property is designed to catch. +// +// The model drives random interleavings of acquire / release / cancel against +// the real scheduler and, after every step, asserts the *liveness* invariant: +// whenever fewer permits are genuinely held than the limit allows, a fresh +// acquire must succeed promptly. A leaked `running_count` violates this. +mod scheduler_liveness { + use super::super::concurrent_agents_scheduler::{ + ConcurrentAgentPermit, ConcurrentAgentsScheduler, + }; + use super::{account, agent, resource_entry_with_agent_limit}; + use proptest::prelude::*; + use std::sync::Arc; + use std::time::Duration; + use test_r::test; + use tokio::task::JoinHandle; + + /// One step in a randomized scheduler workload. 
+ #[derive(Debug, Clone)] + enum Op { + /// Acquire a permit and hold it (resolves immediately if capacity is + /// free, otherwise the in-flight acquire is parked in `pending`). + Acquire, + /// Release a currently-held permit, if any. + Release(prop::sample::Index), + /// Cancel an in-flight (likely queued) acquire, if any. Exercises both + /// "cancelled while queued" and "cancelled just after being granted". + CancelPending(prop::sample::Index), + /// Release a held permit and, in the same step, cancel an in-flight + /// acquire. This is the deadly race: the released slot may be granted + /// to the in-flight acquire's oneshot and then the acquire is cancelled + /// before it can receive it. The slot must still be released. + ReleaseThenCancel(prop::sample::Index, prop::sample::Index), + } + + fn arb_ops() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 3 => Just(Op::Acquire), + 2 => any::().prop_map(Op::Release), + 2 => any::().prop_map(Op::CancelPending), + 3 => (any::(), any::()) + .prop_map(|(a, b)| Op::ReleaseThenCancel(a, b)), + ], + 1..60, + ) + } + + /// Let any synchronous grant/drain bookkeeping triggered by a release or + /// cancellation settle before the next observation. + async fn settle() { + for _ in 0..8 { + tokio::task::yield_now().await; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + proptest! { + // Cap shrink iterations so a failing (buggy) run cannot spend minutes + // re-running wedging inputs against the overall timeout while shrinking. + #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })] + + /// Liveness: under any interleaving of acquire / release / cancel, the + /// scheduler never wedges. After each step, if fewer permits are held + /// than the limit, a fresh acquire must succeed within a short timeout. + /// At the end, draining all held permits must let the account return to + /// full capacity. 
+ #[test] + fn scheduler_never_wedges_under_churn( + limit in 1usize..6, + ops in arb_ops(), + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_time() + .build() + .unwrap(); + + rt.block_on(async move { + // Bound the whole case so a wedge fails fast and deterministically + // rather than hanging the test suite. A correct scheduler completes + // a 60-op workload in well under a second; the bug deadlocks here, + // so a tight bound makes the failure (and any shrinking) quick. + let outcome = tokio::time::timeout(Duration::from_secs(3), async move { + run_workload(limit, ops).await + }) + .await; + + match outcome { + Ok(result) => result, + Err(_elapsed) => Err(TestCaseError::fail( + "scheduler workload did not complete within the overall timeout — \ + deadlock (running_count leaked above true occupancy)", + )), + } + })?; + } + } + + /// Drives one randomized workload against a freshly-registered account and + /// returns `Err` if the liveness invariant is ever violated. Factored out of + /// the proptest body so the whole run can be wrapped in an overall timeout. + async fn run_workload(limit: usize, ops: Vec) -> Result<(), TestCaseError> { + // Short per-acquire timeout: a wedge must surface quickly, but allow + // enough slack for genuine multi-thread scheduling jitter. + const PROBE_TIMEOUT: Duration = Duration::from_millis(500); + + let sched = Arc::new(ConcurrentAgentsScheduler::new()); + let acc = account(); + sched + .register_account(acc, resource_entry_with_agent_limit(limit as u64)) + .await; + + // Permits we are deliberately holding (count against the limit). + let mut held: Vec = Vec::new(); + // In-flight acquires not yet resolved (queued or just granted). 
+ let mut pending: Vec> = Vec::new(); + let mut counter = 0usize; + + for op in ops { + match op { + Op::Acquire => { + counter += 1; + let sched = sched.clone(); + let name = format!("W{counter}"); + let handle = + tokio::spawn(async move { sched.acquire(acc, agent(&name)).await }); + pending.push(handle); + } + Op::Release(idx) => { + if !held.is_empty() { + let i = idx.index(held.len()); + drop(held.remove(i)); + } + } + Op::CancelPending(idx) => { + if !pending.is_empty() { + let i = idx.index(pending.len()); + pending.remove(i).abort(); + } + } + Op::ReleaseThenCancel(ri, ci) => { + if !held.is_empty() { + let i = ri.index(held.len()); + drop(held.remove(i)); + } + if !pending.is_empty() { + let i = ci.index(pending.len()); + pending.remove(i).abort(); + } + } + } + + settle().await; + + // Collect any in-flight acquires that have now resolved into + // held permits, so `held.len()` reflects true occupancy. + let mut still_pending = Vec::new(); + for h in pending.drain(..) { + if h.is_finished() { + if let Ok(permit) = h.await { + held.push(permit); + } + // Cancelled/aborted handles are simply dropped. + } else { + still_pending.push(h); + } + } + pending = still_pending; + + // Liveness invariant: if we are below the limit, a fresh + // acquire must succeed promptly. A leaked running_count + // would make this hang and trip the timeout. + if held.len() < limit { + let probe = + tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("probe"))).await; + prop_assert!( + probe.is_ok(), + "scheduler wedged: held {} < limit {} but acquire timed out", + held.len(), + limit, + ); + // Release the probe immediately. + drop(probe.ok()); + settle().await; + } + } + + // Abort everything still queued, drop all held permits, and + // confirm the account drains back to full capacity: `limit` + // fresh acquires must all succeed. + for h in pending.drain(..) 
{ + h.abort(); + let _ = h.await; + } + held.clear(); + settle().await; + + let mut drained = Vec::new(); + for _ in 0..limit { + let p = tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("drain"))).await; + prop_assert!( + p.is_ok(), + "scheduler did not return to full capacity after churn", + ); + drained.push(p.unwrap()); + } + Ok(()) + } +} From 183de28520b2fa46a0056558136f80ece917d5ca Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:45:14 -0700 Subject: [PATCH 47/60] feat: use official tokio-metrics crate to expose tokio runtime metrics --- golem-debugging-service/src/config.rs | 1 + golem-service-base/src/observability.rs | 29 +++- golem-worker-executor/Cargo.toml | 3 + .../config/worker-executor.sample.env | 3 + .../config/worker-executor.toml | 3 + golem-worker-executor/src/identity.rs | 32 ++++ golem-worker-executor/src/lib.rs | 10 +- golem-worker-executor/src/metrics.rs | 164 ++++++++---------- .../src/services/golem_config.rs | 6 + 9 files changed, 153 insertions(+), 98 deletions(-) create mode 100644 golem-worker-executor/src/identity.rs diff --git a/golem-debugging-service/src/config.rs b/golem-debugging-service/src/config.rs index dc6299652b..6a9869550b 100644 --- a/golem-debugging-service/src/config.rs +++ b/golem-debugging-service/src/config.rs @@ -98,6 +98,7 @@ impl DebugConfig { max_in_function_retry_delay: std::time::Duration::from_secs(20), max_websocket_connections: 100, quota_service: QuotaServiceConfig::default(), + runtime_metrics_sampling_interval: std::time::Duration::from_secs(5), } } } diff --git a/golem-service-base/src/observability.rs b/golem-service-base/src/observability.rs index 98a83dd36e..f9f54b554c 100644 --- a/golem-service-base/src/observability.rs +++ b/golem-service-base/src/observability.rs @@ -18,21 +18,42 @@ use axum::response::IntoResponse; use axum::routing::get; use http::Response; use prometheus::{Encoder, Registry, TextEncoder}; +use 
std::sync::Arc; use tokio::net::{TcpListener, ToSocketAddrs}; use tokio::task::JoinSet; use tracing::{Instrument, info}; +/// A callback that renders additional metrics in Prometheus text exposition +/// format, appended to the output of the `prometheus`-crate registry on the +/// `/metrics` endpoint. Used to surface metrics from a second metrics façade +/// (e.g. the `metrics`-crate recorder driving tokio-metrics) on the same +/// scrape endpoint. +pub type ExtraMetrics = Arc String + Send + Sync>; + pub async fn start_health_and_metrics_server( addr: impl ToSocketAddrs, registry: Registry, body_message: &'static str, join_set: &mut JoinSet>, +) -> Result { + start_health_and_metrics_server_with_extra(addr, registry, None, body_message, join_set).await +} + +pub async fn start_health_and_metrics_server_with_extra( + addr: impl ToSocketAddrs, + registry: Registry, + extra: Option, + body_message: &'static str, + join_set: &mut JoinSet>, ) -> Result { let app = Router::new() .route("/healthcheck", get(move || async move { body_message })) .route( "/metrics", - get(|| async move { prometheus_metrics(registry.clone()) }), + get(move || { + let extra = extra.clone(); + async move { prometheus_metrics(registry.clone(), extra) } + }), ); let listener = TcpListener::bind(addr).await?; @@ -51,13 +72,17 @@ pub async fn start_health_and_metrics_server( Ok(local_addr.port()) } -pub fn prometheus_metrics(registry: Registry) -> impl IntoResponse { +pub fn prometheus_metrics(registry: Registry, extra: Option) -> impl IntoResponse { let encoder = TextEncoder::new(); let mut buffer = Vec::new(); let metric_families = registry.gather(); encoder.encode(&metric_families, &mut buffer).unwrap(); + if let Some(extra) = extra { + buffer.extend_from_slice(extra().as_bytes()); + } + Response::builder() .header("Content-Type", encoder.format_type()) .body(Body::from(buffer)) diff --git a/golem-worker-executor/Cargo.toml b/golem-worker-executor/Cargo.toml index bfe2dcafe8..2aa2f37532 100644 
--- a/golem-worker-executor/Cargo.toml +++ b/golem-worker-executor/Cargo.toml @@ -73,6 +73,8 @@ lazy_static = { workspace = true } log = { workspace = true } mac_address = { workspace = true, features = ["serde"] } md5 = { workspace = true } +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } metrohash = { workspace = true } nonempty-collections = { workspace = true } nonzero_ext = { workspace = true } @@ -92,6 +94,7 @@ sqlx-core = { workspace = true } sysinfo = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } +tokio-metrics = { workspace = true } tokio-stream = { workspace = true } tokio-tungstenite = { workspace = true } tokio-util = { workspace = true } diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index d3c7a04559..35725ab38f 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env @@ -4,6 +4,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0" GOLEM__HTTP_PORT=8082 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s" GOLEM__MAX_WEBSOCKET_CONNECTIONS=100 +GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s" GOLEM__TRACING_FILE_NAME_WITH_PORT=true GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25 GOLEM__ACTIVE_WORKERS__TTL="8h" @@ -215,6 +216,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0" GOLEM__HTTP_PORT=8082 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s" GOLEM__MAX_WEBSOCKET_CONNECTIONS=100 +GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s" GOLEM__TRACING_FILE_NAME_WITH_PORT=true GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25 GOLEM__ACTIVE_WORKERS__TTL="8h" @@ -436,6 +438,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0" GOLEM__HTTP_PORT=8082 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s" GOLEM__MAX_WEBSOCKET_CONNECTIONS=100 +GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s" GOLEM__TRACING_FILE_NAME_WITH_PORT=true GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25 GOLEM__ACTIVE_WORKERS__TTL="8h" diff --git 
a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index e77c5f9bfa..df58d45b50 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ b/golem-worker-executor/config/worker-executor.toml @@ -3,6 +3,7 @@ http_address = "0.0.0.0" http_port = 8082 max_in_function_retry_delay = "20s" max_websocket_connections = 100 +runtime_metrics_sampling_interval = "5s" tracing_file_name_with_port = true [active_workers] @@ -333,6 +334,7 @@ without_time = false # http_port = 8082 # max_in_function_retry_delay = "20s" # max_websocket_connections = 100 +# runtime_metrics_sampling_interval = "5s" # tracing_file_name_with_port = true # # [active_workers] @@ -665,6 +667,7 @@ without_time = false # http_port = 8082 # max_in_function_retry_delay = "20s" # max_websocket_connections = 100 +# runtime_metrics_sampling_interval = "5s" # tracing_file_name_with_port = true # # [active_workers] diff --git a/golem-worker-executor/src/identity.rs b/golem-worker-executor/src/identity.rs new file mode 100644 index 0000000000..e2f95b0cae --- /dev/null +++ b/golem-worker-executor/src/identity.rs @@ -0,0 +1,32 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Process/pod identity for this worker-executor instance. +//! +//! The identity is derived from the `POD_NAME` env var, falling back to +//! `HOSTNAME`, then `"unknown"`, resolved once and cached for the lifetime of +//! the process. 
/// Returns the stable identity of this worker-executor instance.
///
/// The identity is taken from the `POD_NAME` environment variable, falling
/// back to `HOSTNAME`, then to the literal `"unknown"`. A variable that is
/// unset, not valid Unicode, or empty is skipped, so the result is never an
/// empty string.
///
/// Resolved once on first call and cached for the lifetime of the process. It
/// is used both as the `executor_id` metric label and anywhere else the
/// running instance needs to identify itself.
pub fn executor_id() -> &'static str {
    static EXECUTOR_ID: std::sync::OnceLock<String> = std::sync::OnceLock::new();
    EXECUTOR_ID.get_or_init(|| {
        // Candidates are probed lazily and in priority order: `HOSTNAME` is
        // only read when `POD_NAME` yields nothing usable.
        ["POD_NAME", "HOSTNAME"]
            .iter()
            .filter_map(|key| std::env::var(key).ok())
            .find(|value| !value.is_empty())
            .unwrap_or_else(|| "unknown".to_string())
    })
}
- use lazy_static::lazy_static; - use prometheus::*; + use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle}; use tokio::runtime::Handle; + use tokio::task::JoinSet; + use tokio_metrics::RuntimeMetricsReporterBuilder; - lazy_static! { - /// Number of tasks currently sitting in the tokio runtime's global - /// (injection) queue: runnable but not yet polled by any worker thread. - /// A persistently non-zero value means ready tasks (including I/O - /// continuations such as DB-response handling) are waiting for a worker - /// thread, which inflates I/O latency metrics even when the underlying - /// I/O is fast. - static ref GLOBAL_QUEUE_DEPTH: IntGauge = register_int_gauge!( - "executor_tokio_global_queue_depth", - "Tasks scheduled in the tokio runtime global queue, runnable but not yet polled" - ) - .unwrap(); - - /// Number of worker threads in the multi-thread runtime. - static ref NUM_WORKERS: IntGauge = register_int_gauge!( - "executor_tokio_num_workers", - "Number of tokio runtime worker threads" - ) - .unwrap(); - - /// Current number of alive tasks in the runtime. - static ref NUM_ALIVE_TASKS: IntGauge = register_int_gauge!( - "executor_tokio_num_alive_tasks", - "Number of alive tasks in the tokio runtime" - ) - .unwrap(); + /// How often the recorder's upkeep runs to keep its internal storage + /// bounded (e.g. pruning idle metrics once an idle timeout is configured). + const UPKEEP_INTERVAL: Duration = Duration::from_secs(30); - /// Per-worker busy ratio over the last sampling interval: the fraction - /// of wall-clock time the worker spent executing tasks. A value near 1.0 - /// means the worker is CPU-saturated and cannot promptly poll newly - /// ready tasks. 
- static ref WORKER_BUSY_RATIO: GaugeVec = register_gauge_vec!( - "executor_tokio_worker_busy_ratio", - "Fraction of wall-clock time each tokio worker spent busy over the sampling interval", - &["worker"] - ) - .unwrap(); - } - - /// Background loop that samples stable tokio runtime metrics and exports them - /// to Prometheus. + /// Installs a dedicated `metrics`-crate Prometheus recorder for tokio + /// runtime metrics, spawns the tokio-metrics reporter on `join_set`, and + /// returns a renderer that emits the collected metrics in Prometheus text + /// format. /// - /// All metrics used here are stable as of tokio 1.45 (the workspace resolves - /// 1.50+), so this requires neither the `tokio_unstable` cfg nor any build - /// flag. `global_queue_depth` is the primary diagnostic for runtime - /// scheduling pressure; `worker_busy_ratio` corroborates it by showing - /// per-worker CPU saturation. Never returns. - pub async fn run_runtime_metrics_loop(handle: Handle) -> anyhow::Result<()> { - const INTERVAL: Duration = Duration::from_secs(5); - - let metrics = handle.metrics(); - let num_workers = metrics.num_workers(); - NUM_WORKERS.set(num_workers as i64); - - // Previous cumulative busy duration per worker, for computing the busy - // ratio over each interval. 
- let mut prev_busy: Vec = (0..num_workers) - .map(|w| metrics.worker_total_busy_duration(w)) - .collect(); - let mut prev_instant = Instant::now(); - - let mut interval = tokio::time::interval(INTERVAL); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - interval.tick().await; - - GLOBAL_QUEUE_DEPTH.set(metrics.global_queue_depth() as i64); - NUM_ALIVE_TASKS.set(metrics.num_alive_tasks() as i64); - - let now = Instant::now(); - let elapsed = now.duration_since(prev_instant).as_secs_f64(); - prev_instant = now; - if elapsed > 0.0 { - for (w, prev) in prev_busy.iter_mut().enumerate() { - let busy = metrics.worker_total_busy_duration(w); - let delta = busy.saturating_sub(*prev).as_secs_f64(); - *prev = busy; - WORKER_BUSY_RATIO - .with_label_values(&[&w.to_string()]) - .set((delta / elapsed).min(1.0)); - } + /// `sampling_interval` controls how often metrics are sampled from the + /// runtime into the recorder; Prometheus scrapes the rendered values + /// independently. + /// + /// The returned closure is appended to the `prometheus`-crate scrape output + /// on the shared `/metrics` endpoint, so all `tokio_*` series appear on the + /// same endpoint as the rest of the executor's metrics, carrying the same + /// `executor_id` label. + /// + /// Returns `None` if a global `metrics` recorder is already installed (which + /// should not happen in the executor), in which case runtime metrics are + /// simply not exported. 
+ pub fn install_runtime_metrics( + runtime: Handle, + sampling_interval: Duration, + join_set: &mut JoinSet>, + ) -> Option String + Send + Sync>> { + let executor_id = crate::identity::executor_id(); + + let handle: PrometheusHandle = match PrometheusBuilder::new() + .add_global_label("executor_id", executor_id) + .install_recorder() + { + Ok(handle) => handle, + Err(err) => { + tracing::warn!( + "Failed to install tokio runtime metrics recorder, runtime metrics will not be exported: {err}" + ); + return None; } - } + }; + + let reporter = RuntimeMetricsReporterBuilder::default().with_interval(sampling_interval); + join_set.spawn_on( + async move { + reporter.describe_and_run().await; + Ok(()) + }, + &runtime, + ); + + // Run periodic upkeep so the recorder's internal storage stays bounded. + let upkeep_handle = handle.clone(); + join_set.spawn_on( + async move { + let mut interval = tokio::time::interval(UPKEEP_INTERVAL); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + interval.tick().await; + upkeep_handle.run_upkeep(); + } + }, + &runtime, + ); + + Some(Arc::new(move || handle.render())) } } @@ -874,16 +858,10 @@ pub mod storage { use lazy_static::lazy_static; use prometheus::*; - /// Returns the executor identity label: POD_NAME env var, falling back to HOSTNAME, then "unknown". - /// Resolved once on first call and cached for the lifetime of the process. - pub fn executor_id() -> &'static str { - static EXECUTOR_ID: std::sync::OnceLock = std::sync::OnceLock::new(); - EXECUTOR_ID.get_or_init(|| { - std::env::var("POD_NAME") - .or_else(|_| std::env::var("HOSTNAME")) - .unwrap_or_else(|_| "unknown".to_string()) - }) - } + /// Re-exported from [`crate::identity`], which owns the process identity. + /// Kept here so existing metric-recording call sites can keep using + /// `crate::metrics::storage::executor_id()`. + pub use crate::identity::executor_id; lazy_static! 
{ pub static ref STORAGE_FILESYSTEM_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!( diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index a11a411f77..946b20dae6 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -73,6 +73,11 @@ pub struct GolemConfig { pub max_websocket_connections: usize, pub http_address: String, pub http_port: u16, + /// How often tokio runtime metrics are sampled from the runtime and pushed + /// into the metrics recorder exposed on `/metrics`. Prometheus scrapes the + /// rendered values independently; this is the in-process resolution. + #[serde(with = "humantime_serde")] + pub runtime_metrics_sampling_interval: Duration, } impl SafeDisplay for GolemConfig { @@ -284,6 +289,7 @@ impl Default for GolemConfig { max_websocket_connections: 100, http_address: "0.0.0.0".to_string(), http_port: 8082, + runtime_metrics_sampling_interval: Duration::from_secs(5), } } } From 78d311d151ca40ffc3ab17de207df5f50009eff7 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:50:05 -0700 Subject: [PATCH 48/60] feat: use official tokio-metrics crate to expose tokio runtime metrics vol 2 --- Cargo.lock | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 ++ 2 files changed, 107 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 9296de06e0..3239afe40f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4277,6 +4277,8 @@ dependencies = [ "log", "mac_address", "md5", + "metrics", + "metrics-exporter-prometheus", "metrohash", "nonempty-collections", "nonzero_ext", @@ -4301,6 +4303,7 @@ dependencies = [ "tempfile", "test-r", "tokio", + "tokio-metrics", "tokio-stream", "tokio-tungstenite 0.25.0", "tokio-util", @@ -6007,6 +6010,46 @@ dependencies = [ "autocfg", ] +[[package]] +name = "metrics" +version = "0.24.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "89550ee9f79e88fef3119de263694973a8adb26c21d75322164fb8c493039fe2" +dependencies = [ + "portable-atomic", + "rapidhash", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd7399781913e5393588a8d8c6a2867bf85fb38eaf2502fdce465aad2dc6f034" +dependencies = [ + "base64 0.22.1", + "indexmap 2.14.0", + "metrics", + "metrics-util", + "quanta", + "thiserror 1.0.69", +] + +[[package]] +name = "metrics-util" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8496cc523d1f94c1385dd8f0f0c2c480b2b8aeccb5b7e4485ad6365523ae376" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.15.5", + "metrics", + "quanta", + "rand 0.9.2", + "rand_xoshiro", + "sketches-ddsketch", +] + [[package]] name = "metrohash" version = "1.0.7" @@ -7669,6 +7712,21 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quick-error" version = "1.2.3" @@ -7856,6 +7914,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + [[package]] name = "range-set-blaze" version = "0.1.16" @@ -7868,6 +7935,24 @@ dependencies = [ "num-traits", ] +[[package]] +name = "rapidhash" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59" 
+dependencies = [ + "rustversion", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.11.1", +] + [[package]] name = "rayon" version = "1.11.0" @@ -9103,6 +9188,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" + [[package]] name = "slab" version = "0.4.12" @@ -9946,6 +10037,19 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "tokio-metrics" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9e81d53caf955549b1dec7af4ac2149e94cc25ed97b4a545151140281e2f528" +dependencies = [ + "futures-util", + "metrics", + "pin-project-lite", + "tokio", + "tokio-stream", +] + [[package]] name = "tokio-native-tls" version = "0.3.1" diff --git a/Cargo.toml b/Cargo.toml index b6ba881258..eac4fa4458 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -154,6 +154,8 @@ log = "0.4.26" mac_address = "1.1.8" mappable-rc = "0.1.1" md5 = "0.7.0" +metrics = "0.24.2" +metrics-exporter-prometheus = { version = "0.16.2", default-features = false } metrohash = "1.0.7" miette = { version = "7.6.0", features = ["fancy"] } mime = "0.3.17" @@ -248,6 +250,7 @@ textwrap = "0.16.1" thiserror = "2.0.12" time = { version = "0.3.41", features = ["default", "macros"] } tokio = { version = "1.44", features = ["macros", "rt-multi-thread", "sync", "io-std", "net", "tracing", "process", "signal"] } +tokio-metrics = { version = "0.5.0", features = ["metrics-rs-integration"] } tokio-postgres = "0.7.13" tokio-rustls = { version = "0.26.2" } tokio-stream = { version = 
"0.1", features = ["sync"] } From aec411b7275491efe6311313513df5d2662f12c3 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:59:53 -0700 Subject: [PATCH 49/60] chore: cleanup comments --- .../concurrent_agents_scheduler.rs | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs index 2391fce697..3d20d187b6 100644 --- a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs +++ b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs @@ -55,22 +55,18 @@ struct QueuedAgent { /// the responsibility to decrement the account's `running_count` and wake the /// next queued agent when it is released. /// -/// Crucially, the `running_count` was incremented *together with* acquiring the -/// raw permit, and the matching decrement lives **only** here in `Drop`. This -/// binds the count strictly to the lifetime of the granted permit, regardless -/// of how the slot is ultimately disposed of: +/// The `running_count` is incremented together with acquiring the raw permit, +/// and the matching decrement lives only here in `Drop`. This binds the count +/// strictly to the lifetime of the granted permit, regardless of how the slot +/// is disposed of: /// -/// * It is moved into a [`ConcurrentAgentPermit`] and dropped when the agent +/// * it is moved into a [`ConcurrentAgentPermit`] and dropped when the agent /// releases the slot (the normal case), or /// * it is sent into a queued waiter's oneshot and that waiter is cancelled -/// before receiving it — the slot is then dropped *inside* the channel. +/// before receiving it — the slot is then dropped inside the channel. 
/// /// Both paths run this same `Drop`, so a slot granted to a waiter that is -/// cancelled after the grant succeeded cannot leak the count. (A previous -/// design decremented only when the oneshot `send` failed, which left -/// `running_count` permanently inflated when a waiter was cancelled *after* a -/// successful send — wedging the whole account once the count reached the -/// limit.) +/// cancelled after the grant succeeded cannot leak the count. struct GrantedSlot { raw: Option, account: Arc, @@ -359,7 +355,7 @@ fn try_grant_next_sync(account: &Arc, account_id: &AccountId) /// uses `try_acquire_owned` which does not block. /// /// Each granted permit is wrapped in a [`GrantedSlot`] carrying the -/// `running_count` decrement, so a waiter cancelled *after* a successful send +/// `running_count` decrement, so a waiter cancelled after a successful send /// still releases its slot (via the slot's `Drop` when the oneshot channel is /// dropped) rather than leaking the count. The increment here is matched /// one-for-one by that slot's `Drop`. 
From 71bee784c2bcd44a42061c2e897be5036810f458 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:05:30 -0700 Subject: [PATCH 50/60] feat: try mimalloc --- golem-worker-executor/Cargo.toml | 1 + golem-worker-executor/src/server.rs | 3 +++ 2 files changed, 4 insertions(+) diff --git a/golem-worker-executor/Cargo.toml b/golem-worker-executor/Cargo.toml index 2aa2f37532..9e83a12c1f 100644 --- a/golem-worker-executor/Cargo.toml +++ b/golem-worker-executor/Cargo.toml @@ -76,6 +76,7 @@ md5 = { workspace = true } metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } metrohash = { workspace = true } +mimalloc = { workspace = true } nonempty-collections = { workspace = true } nonzero_ext = { workspace = true } pgvector = { workspace = true } diff --git a/golem-worker-executor/src/server.rs b/golem-worker-executor/src/server.rs index fbd1c7e60c..18b286adcb 100644 --- a/golem-worker-executor/src/server.rs +++ b/golem-worker-executor/src/server.rs @@ -21,6 +21,9 @@ use std::sync::Arc; use tokio::task::JoinSet; use tracing::info; +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + fn main() -> Result<(), anyhow::Error> { match make_config_loader().load_or_dump_config() { Some(mut config) => { From b14682347ce50688ecc32e7109a3ecb862d3dfe1 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:06:54 -0700 Subject: [PATCH 51/60] feat: try mimalloc vol 2 --- Cargo.lock | 19 +++++++++++++++++++ Cargo.toml | 1 + 2 files changed, 20 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 3239afe40f..1821488058 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4280,6 +4280,7 @@ dependencies = [ "metrics", "metrics-exporter-prometheus", "metrohash", + "mimalloc", "nonempty-collections", "nonzero_ext", "pgvector", @@ -5738,6 +5739,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libmimalloc-sys" +version = "0.1.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a45a52f43e1c16f667ccfe4dd8c85b7f7c204fd5e3bf46c5b0db9a5c3c0b8e9" +dependencies = [ + "cc", +] + [[package]] name = "libredox" version = "0.1.15" @@ -6086,6 +6096,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "mimalloc" +version = "0.1.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d4139bb28d14ad1facf21d5eb8825051b326e172d216b39f6d31df53cc97862" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.17" diff --git a/Cargo.toml b/Cargo.toml index eac4fa4458..fbe1e4866e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -160,6 +160,7 @@ metrohash = "1.0.7" miette = { version = "7.6.0", features = ["fancy"] } mime = "0.3.17" mime_guess = "2.0.5" +mimalloc = "0.1.52" minijinja = "2.7.0" nanoid = "0.4.0" From 0ef2c16ce7bdf104479720011d199b9364acc87b Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:56:33 -0700 Subject: [PATCH 52/60] perf: enable thin LTO and codegen-units=1 for release builds --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index fbe1e4866e..e9ba81fc0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -341,6 +341,8 @@ debug = "line-tables-only" [profile.release] panic = "abort" +lto = "thin" +codegen-units = 1 [profile.benchmarks] inherits = "release" From 99e3633aa065990da6259798bdfb30d704215f10 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 23:11:41 -0700 Subject: [PATCH 53/60] perf: pin target-cpu baseline for published images (x86-64-v3, neoverse-n1) --- .github/workflows/ci.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 095bdbd015..7a8d2fb9f1 
100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -836,10 +836,14 @@ jobs: - platform: linux/amd64 name: linux/amd64 target: x86_64-unknown-linux-gnu + target_cpu_env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS + target_cpu: "-C target-cpu=x86-64-v3" - platform: linux/arm64 name: linux/arm64 target: aarch64-unknown-linux-gnu cross: true + target_cpu_env: CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUSTFLAGS + target_cpu: "-C target-cpu=neoverse-n1" name: docker-targets-build (${{ matrix.platform.platform }}) steps: - uses: actions/checkout@v5 @@ -854,6 +858,12 @@ jobs: run: | platform=${{ matrix.platform.platform }} echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV + # Target-scoped rustflags pin the instruction-set baseline for the + # published images: x86-64-v3 on amd64, Neoverse-N1 on arm64. The + # per-target CARGO_TARGET_*_RUSTFLAGS form is used because plain + # RUSTFLAGS is ignored when cross-compiling; CARGO_-prefixed vars are + # also passed through into the cross container automatically. + echo "${{ matrix.platform.target_cpu_env }}=${{ matrix.platform.target_cpu }}" >> $GITHUB_ENV - run: cargo install cross if: ${{ matrix.platform.cross }} From 3218b4f8dc4acc2260a1341347c4eed117f0b4db Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Fri, 12 Jun 2026 23:25:38 -0700 Subject: [PATCH 54/60] perf: drop codegen-units=1, keep thin LTO The codegen-units=1 runtime gain is marginal once thin LTO is enabled, while it noticeably increases release build time. Revert to the default to keep intra-crate parallelism. 
--- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e9ba81fc0c..32b71fd001 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -342,7 +342,6 @@ debug = "line-tables-only" [profile.release] panic = "abort" lto = "thin" -codegen-units = 1 [profile.benchmarks] inherits = "release" From ac3b3ee6145d9a5269e71af02ce5bb1049d9677d Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sat, 13 Jun 2026 20:29:38 -0700 Subject: [PATCH 55/60] chore: lower number of concurrent agents to 200 in case of durability overhead test --- integration-tests/benchmark_suites/cloud-perf.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index ea8ce74403..9253d128ad 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -126,5 +126,5 @@ benchmarks: - name: durability-overhead iterations: 1 clusterSize: [2] - size: [10, 50, 100, 250] + size: [10, 50, 100, 200] length: [5000] From ee49d867f0d7b6ff4d9b56f51155ff8ceb9ad97c Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 14 Jun 2026 02:44:31 -0700 Subject: [PATCH 56/60] feat: restore 3 iterations --- .../benchmark_suites/cloud-perf.yaml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml index 9253d128ad..ef8dd7d61f 100644 --- a/integration-tests/benchmark_suites/cloud-perf.yaml +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -29,7 +29,7 @@ benchmarks: # size = number of workers per implementation (×6 implementations total) # length = unused for echo - name: throughput-echo - iterations: 1 + iterations: 3 clusterSize: [2] size: [1, 50, 100, 250] length: [1000] @@ -40,7 +40,7 @@ benchmarks: #
benchmark most relevant to the memory-admission investigation — sized to # match throughput-echo so it exercises real density. - name: throughput-large-input - iterations: 1 + iterations: 3 clusterSize: [2] size: [1, 50, 100, 250] length: [100, 10000] @@ -48,7 +48,7 @@ benchmarks: # size = number of workers per implementation # length = CPU work length passed to cpu_intensive - name: throughput-cpu-intensive - iterations: 1 + iterations: 3 clusterSize: [2] size: [1, 50, 100, 250] length: [100] @@ -58,14 +58,14 @@ benchmarks: # size = number of unique components created (each in its own env) # length = seconds to wait per component for pre-compilation warm-up - name: cold-start-unknown-small - iterations: 1 + iterations: 3 clusterSize: [2] size: [1, 5, 25, 50] length: [2] disableCompilationCache: true - name: cold-start-unknown-medium - iterations: 1 + iterations: 3 clusterSize: [2] size: [1, 5, 25, 50] length: [5] @@ -78,13 +78,13 @@ benchmarks: # NOTE: if results here are close to the cache-disabled entries above, the # warm-up wait is too short and compilation hasn't finished — bump length. 
- name: cold-start-unknown-small - iterations: 1 + iterations: 3 clusterSize: [2] size: [1, 5, 25, 50] length: [2] - name: cold-start-unknown-medium - iterations: 1 + iterations: 3 clusterSize: [2] size: [1, 5, 25, 50] length: [5] @@ -94,13 +94,13 @@ benchmarks: # size = number of workers created # length = number of hot invocations per worker after the first cold one - name: latency-small - iterations: 1 + iterations: 3 clusterSize: [2] size: [100, 500, 1000, 2000, 5000] length: [2] - name: latency-medium - iterations: 1 + iterations: 3 clusterSize: [2] size: [100, 500, 1000, 2000] length: [5] @@ -112,7 +112,7 @@ benchmarks: # size = number of workers launched in parallel # length = sleep duration in milliseconds - name: sleep - iterations: 1 + iterations: 3 clusterSize: [2] size: [10, 100, 500, 1000, 2000] length: [10000] @@ -124,7 +124,7 @@ benchmarks: # size = number of workers per variant # length = loop iteration count passed to oplog_heavy - name: durability-overhead - iterations: 1 + iterations: 3 clusterSize: [2] size: [10, 50, 100, 200] length: [5000] From e0bdf30affd862d6d83c101b727097d80e37fbdd Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 14 Jun 2026 14:14:48 -0700 Subject: [PATCH 57/60] fix: release worker memory grant on cancelled start --- .../services/active_workers/admission/mod.rs | 117 +++++++-- .../src/services/active_workers/mod.rs | 56 ++--- .../src/services/active_workers/tests.rs | 222 ++++++++++++++++++ .../src/services/golem_config.rs | 4 +- golem-worker-executor/src/worker/mod.rs | 86 ++++--- 5 files changed, 408 insertions(+), 77 deletions(-) diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs index e9ca7f7079..02175c11ee 100644 --- a/golem-worker-executor/src/services/active_workers/admission/mod.rs +++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs @@ -37,10 +37,15 @@ 
//! both the burst race and later faulting of granted pages. //! //! The granted total is maintained by two integer updates: a worker's grant is -//! added on admission and removed on unload (via [`AdmissionController::release`] -//! from the worker lifecycle). The headroom check re-derives the reservation -//! from this maintained total and the current probe reading, so it is O(1) and -//! exact regardless of worker churn. +//! added on admission, and removed when the [`MemoryGrant`] guard returned by +//! admission is dropped. Tying the removal to the guard's drop — rather than to +//! an explicit release call on some worker-lifecycle path — keeps the accounting +//! symmetric no matter how a worker's start ends: whether it becomes resident and +//! later stops, or its start is cancelled mid-flight (e.g. the worker is deleted +//! while still waiting for permits), dropping the guard returns its reservation +//! exactly once. The headroom check re-derives the reservation from the +//! maintained total and the current probe reading, so it is O(1) and exact +//! regardless of worker churn. //! //! When headroom is short the controller evicts already-resident idle-then-warm //! work; if it still cannot make room it rejects rather than over-committing. @@ -51,12 +56,12 @@ use super::memory_probe::MemoryProbe; use async_trait::async_trait; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; /// Why an eviction candidate is worth evicting, in priority order. Lower /// variants are evicted first. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub enum EvictionPriority { +pub(crate) enum EvictionPriority { /// Resident in memory, not executing, no durable pending work. Cheapest to /// evict — losing it costs at most a re-load on next use. Idle, @@ -69,7 +74,7 @@ pub enum EvictionPriority { /// restore headroom. Abstracts over the live worker set so the decision logic /// is testable without `Worker`/wasmtime. 
#[async_trait] -pub trait EvictionSource: Send + Sync { +pub(crate) trait EvictionSource: Send + Sync { /// Evict at the given priority tier, attempting to free at least /// `needed_bytes`. Returns the number of bytes actually reclaimed (which may /// be less if the tier is exhausted, or more if a single victim was larger @@ -80,7 +85,7 @@ pub trait EvictionSource: Send + Sync { /// The outcome of an admission attempt. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum AdmissionDecision { +pub(crate) enum AdmissionDecision { /// There is enough real headroom (possibly after eviction) to admit the /// request without risking the limit. Admit, @@ -96,7 +101,7 @@ pub enum AdmissionDecision { /// arenas, runtime buffers). Mirrors `worker_memory_ratio`, but applied to the /// measured limit rather than the configured total. #[derive(Debug, Clone, Copy)] -pub struct AdmissionPolicy { +pub(crate) struct AdmissionPolicy { /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission. pub usable_ratio: f64, } @@ -106,7 +111,7 @@ pub struct AdmissionPolicy { /// probe on each call. The only retained state is `granted`: the total linear /// memory granted to live workers, maintained across admit and unload, which the /// gate reserves so a worker cannot OOM the node by faulting in granted pages. -pub struct AdmissionController { +pub(crate) struct AdmissionController { probe: Box, policy: AdmissionPolicy, granted: Mutex, @@ -181,7 +186,7 @@ impl AdmissionController { /// becomes resident and shared by all its workers. Unlike admission this does /// not evict or reject (the worker is already in); it accounts the bytes so /// later admissions see them. Released with [`Self::release`]. - pub fn reserve_committed(&self, bytes: u64) { + pub(crate) fn reserve_committed(&self, bytes: u64) { self.reserve(bytes); } @@ -189,7 +194,7 @@ impl AdmissionController { /// granted. 
Its pages leave memory, so its grant no longer needs reserving; /// not releasing it would permanently shrink admissible headroom as workers /// come and go. - pub fn release(&self, reserved_bytes: u64) { + pub(crate) fn release(&self, reserved_bytes: u64) { let mut granted = self.granted.lock().unwrap(); *granted = granted.saturating_sub(reserved_bytes); crate::metrics::workers::record_worker_memory_granted(*granted); @@ -211,7 +216,7 @@ impl AdmissionController { /// headroom is re-measured against ground truth; the request is admitted only /// if the real headroom now covers it, otherwise it is rejected. On admit the /// request is added to the in-flight reservation. - pub async fn try_admit( + async fn try_admit( &self, request_bytes: u64, source: &dyn EvictionSource, @@ -244,11 +249,91 @@ impl AdmissionController { } } - /// The current admissible headroom. Exposed for metrics and for callers that - /// want to make their own pre-check. - pub fn headroom_bytes(&self) -> u64 { + /// The current admissible headroom. Used by tests to assert the gate's + /// accounting; production reads headroom indirectly through admission. + #[cfg(test)] + pub(crate) fn headroom_bytes(&self) -> u64 { self.admissible_headroom() } + + /// Like [`Self::try_admit`], but on admit returns a [`MemoryGrant`] guard + /// that owns the reservation and releases it on drop. The grant a starting + /// worker holds passes through several `.await` points before the worker + /// becomes resident (per-account concurrency, component charge, filesystem + /// storage); if that work is cancelled — as when the worker is deleted while + /// still waiting — the guard's drop returns the reservation, so a cancelled + /// start cannot leak headroom. 
+ pub(crate) async fn try_admit_grant( + self: &Arc, + request_bytes: u64, + source: &dyn EvictionSource, + ) -> Option { + match self.try_admit(request_bytes, source).await { + AdmissionDecision::Admit => Some(MemoryGrant { + controller: Some(self.clone()), + bytes: request_bytes, + }), + AdmissionDecision::Reject => None, + } + } +} + +/// Owns a memory reservation made with the [`AdmissionController`] and returns it +/// to the gate when dropped, so a reservation is released exactly once regardless +/// of whether the worker became resident or its start was cancelled. +/// +/// When measured admission is disabled (no controller) the grant is inert: it +/// reserves nothing and releasing it is a no-op, so callers can hold a grant +/// uniformly without branching on whether admission is active. +pub(crate) struct MemoryGrant { + controller: Option>, + bytes: u64, +} + +impl MemoryGrant { + /// An inert grant for when measured admission is disabled: holds no + /// reservation and releases nothing on drop. + pub(crate) fn inert() -> Self { + Self { + controller: None, + bytes: 0, + } + } + + /// Fold another grant's bytes into this one, so a worker that grows its + /// memory carries a single grant covering its whole reservation. The other + /// grant is consumed and its reservation transferred here; the combined total + /// is released exactly once when this grant drops. + pub(crate) fn merge(&mut self, mut other: MemoryGrant) { + if other.controller.is_some() { + // Adopt the controller so a merged grant acquired while admission was + // enabled still releases, even if `self` started inert. + if self.controller.is_none() { + self.controller = other.controller.take(); + } + self.bytes += other.bytes; + } + // Neutralize the absorbed grant so its drop does not release the bytes + // now owned by `self`. 
+ other.bytes = 0; + other.controller = None; + } +} + +impl std::fmt::Debug for MemoryGrant { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemoryGrant") + .field("bytes", &self.bytes) + .finish() + } +} + +impl Drop for MemoryGrant { + fn drop(&mut self) { + if let Some(controller) = &self.controller { + controller.release(self.bytes); + } + } } #[cfg(test)] diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 4ac70f8744..9cccc56112 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -29,7 +29,8 @@ pub use fs_semaphore::{ filesystem_storage_permits_to_bytes, filesystem_storage_pool_bytes_to_permits, }; -use admission::{AdmissionController, AdmissionDecision, EvictionPriority, EvictionSource}; +pub(crate) use admission::MemoryGrant; +use admission::{AdmissionController, EvictionPriority, EvictionSource}; use async_trait::async_trait; pub use component_charge::HeldComponentCharge; use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry}; @@ -235,7 +236,10 @@ impl ActiveWorkers { } /// Blocking memory admission for a starting worker. Loops until the gate - /// admits the request, backing off between attempts. + /// admits the request, backing off between attempts, and returns a + /// [`MemoryGrant`] guard owning the reservation: the worker holds it for as + /// long as it is resident and releases it by dropping the guard, so a start + /// cancelled before the worker becomes resident cannot leak the reservation. /// /// A rejection is transient, not terminal. 
The gate reads resident memory /// from the probe, which lags real usage (cgroup `memory.current` only counts @@ -244,18 +248,19 @@ impl ActiveWorkers { /// Each iteration backs off and re-reads the gate, so the caller eventually /// proceeds once headroom recovers rather than failing under momentary /// pressure. With measured admission disabled the worker is admitted - /// immediately. - pub async fn acquire(&self, memory: u64) { + /// immediately with an inert grant. + pub(crate) async fn acquire(&self, memory: u64) -> MemoryGrant { let Some(admission) = &self.admission else { - return; + return MemoryGrant::inert(); }; loop { // Evicts idle-then-warm when real headroom is short; rejects (and we // back off) when it cannot make room rather than risking the limit. - if admission.try_admit(memory, &self.eviction_source()).await - == AdmissionDecision::Admit + if let Some(grant) = admission + .try_admit_grant(memory, &self.eviction_source()) + .await { - return; + return grant; } debug!("Measured headroom insufficient for {memory}, backing off and retrying"); tokio::time::sleep(self.acquire_retry_delay).await; @@ -271,33 +276,28 @@ impl ActiveWorkers { } /// Non-blocking memory admission for a growing worker. A single gate attempt: - /// returns `true` when the grow is admitted, `false` when real headroom is - /// insufficient even after eviction (the caller turns this into a retriable - /// out-of-memory trap). With measured admission disabled the grow is always - /// admitted. - pub async fn try_acquire(&self, memory: u64) -> bool { + /// returns the additional [`MemoryGrant`] when the grow is admitted, or `None` + /// when real headroom is insufficient even after eviction (the caller turns + /// `None` into a retriable out-of-memory trap). The returned grant should be + /// merged into the worker's existing grant so its whole reservation is + /// released together on unload. With measured admission disabled the grow is + /// always admitted with an inert grant. 
+ pub(crate) async fn try_acquire(&self, memory: u64) -> Option { let Some(admission) = &self.admission else { - return true; + return Some(MemoryGrant::inert()); }; - match admission.try_admit(memory, &self.eviction_source()).await { - AdmissionDecision::Admit => true, - AdmissionDecision::Reject => { + match admission + .try_admit_grant(memory, &self.eviction_source()) + .await + { + Some(grant) => Some(grant), + None => { debug!("Measured headroom insufficient for {memory}, not admitting"); - false + None } } } - /// Release the memory a worker reserved with the admission gate when it - /// unloads. `bytes` must be the cumulative amount the worker reserved through - /// [`Self::acquire`] and [`Self::try_acquire`], so the gate's granted total - /// stays symmetric. No-op when measured admission is disabled. - pub fn release_memory(&self, bytes: u64) { - if let Some(admission) = &self.admission { - admission.release(bytes); - } - } - /// Blocking acquire of storage semaphore permits. Loops until the requested /// number of bytes is available, evicting idle workers as needed. pub async fn acquire_filesystem_storage(&self, storage_bytes: u64) -> FilesystemStoragePermit { diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs index 1f6c8313cf..66017602a3 100644 --- a/golem-worker-executor/src/services/active_workers/tests.rs +++ b/golem-worker-executor/src/services/active_workers/tests.rs @@ -1060,3 +1060,225 @@ mod scheduler_liveness { Ok(()) } } + +// ── Grant-guard liveness under random churn ────────────────────────────────── +// +// A worker's memory grant is reserved with the admission gate and then owned by +// a guard that lives in one of three places over the worker's lifetime: in the +// in-flight start task (waiting for permits), in the resident worker (started), +// or dropped (the worker exited or its start was cancelled). 
The liveness +// invariant — mirroring `scheduler_liveness` for the concurrent-agents scheduler +// — is that however the guard travels between those places, the gate's +// accounting stays symmetric: once every guard is gone, admissible headroom +// returns to the full ceiling. A reservation released zero times (leak, the +// cancelled-while-waiting deletion bug) or more than once (double-release) breaks +// it. With a zero-usage probe, headroom is `ceiling - granted`, so the final +// headroom reads the granted total directly. +mod grant_guard_liveness { + use super::super::admission::{ + AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource, MemoryGrant, + }; + use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot}; + use proptest::prelude::*; + use std::sync::Arc; + use std::time::Duration; + use test_r::test; + use tokio::task::JoinHandle; + + /// Probe with a fixed limit reporting zero resident usage, so admissible + /// headroom equals `ceiling - granted` and reads the granted accounting + /// directly — the quantity a leaked or double-released grant corrupts. + #[derive(Debug)] + struct ZeroUsageProbe { + limit: u64, + } + + impl MemoryProbe for ZeroUsageProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit, + current_bytes: 0, + } + } + } + + /// Nothing to evict: a rejected request stays rejected (the schedule keeps + /// total grants within the ceiling so admission only fails transiently, never + /// due to a leak the gate could not see). + struct NoEvictionSource; + + #[async_trait::async_trait] + impl EvictionSource for NoEvictionSource { + async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 { + 0 + } + } + + /// One step in a randomized grant-lifecycle workload. 
+ #[derive(Debug, Clone)] + enum Op { + /// Begin a worker start: spawn a task that acquires a grant of this many + /// bytes and then parks holding it, as a worker waits for its remaining + /// permits before becoming resident. + Start(u64), + /// A still-in-flight start becomes resident: its task yields the grant + /// guard, which we keep (the worker is now running). + Resident(prop::sample::Index), + /// Cancel a still-in-flight start, as deleting a waiting worker does: + /// abort the task, dropping the grant guard it held. + CancelStart(prop::sample::Index), + /// A resident worker exits: drop its grant guard. + Exit(prop::sample::Index), + } + + fn arb_ops() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 4 => (1u64..50).prop_map(Op::Start), + 2 => any::().prop_map(Op::Resident), + 3 => any::().prop_map(Op::CancelStart), + 2 => any::().prop_map(Op::Exit), + ], + 1..80, + ) + } + + /// An in-flight start: the task acquires the grant, then sends it back over + /// `ready` and parks, so the driver can either take the grant (the worker + /// became resident) or abort the task (the start was cancelled, dropping the + /// grant inside the task). + struct InFlight { + handle: JoinHandle<()>, + ready: tokio::sync::oneshot::Receiver, + } + + /// Drive one randomized workload and assert headroom recovers to the ceiling + /// once every grant guard is gone. 
+ async fn run_workload(limit: u64, ops: Vec) -> Result<(), TestCaseError> { + let controller = Arc::new(AdmissionController::new( + Box::new(ZeroUsageProbe { limit }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + + let mut in_flight: Vec = Vec::new(); + let mut resident: Vec = Vec::new(); + + for op in ops { + match op { + Op::Start(bytes) => { + let controller = controller.clone(); + let (tx, rx) = tokio::sync::oneshot::channel(); + let handle = tokio::spawn(async move { + if let Some(grant) = + controller.try_admit_grant(bytes, &NoEvictionSource).await + { + // Hand the grant to the driver, then park holding the + // task alive so an abort drops the guard if the driver + // never took it. + let _ = tx.send(grant); + } + std::future::pending::<()>().await; + }); + in_flight.push(InFlight { handle, ready: rx }); + } + Op::Resident(idx) => { + if !in_flight.is_empty() { + let i = idx.index(in_flight.len()); + let started = in_flight.remove(i); + // Take the grant out of the task (worker is now resident), + // then abort the now-grantless parked task. + match started.ready.await { + Ok(grant) => { + resident.push(grant); + started.handle.abort(); + let _ = started.handle.await; + } + Err(_) => { + // Admission was rejected; nothing was granted. + started.handle.abort(); + let _ = started.handle.await; + } + } + } + } + Op::CancelStart(idx) => { + if !in_flight.is_empty() { + let i = idx.index(in_flight.len()); + let started = in_flight.remove(i); + // Delete a waiting worker: abort mid-flight. If the task + // had already acquired its grant, the guard is dropped + // inside the aborted task. + started.handle.abort(); + let _ = started.handle.await; + } + } + Op::Exit(idx) => { + if !resident.is_empty() { + let i = idx.index(resident.len()); + drop(resident.remove(i)); + } + } + } + // Let acquires/aborts settle so the granted accounting is observable. 
+ for _ in 0..4 { + tokio::task::yield_now().await; + } + } + + // Tear everything down: abort remaining starts, drop remaining resident + // grants. The environment is now empty. + for started in in_flight.drain(..) { + started.handle.abort(); + let _ = started.handle.await; + } + resident.clear(); + // Let the final drops' releases settle. + tokio::time::sleep(Duration::from_millis(20)).await; + + let headroom = controller.headroom_bytes(); + prop_assert_eq!( + headroom, + limit, + "headroom did not recover to ceiling {} after all grants were released (got {}); \ + a grant leaked or was double-released across the lifecycle", + limit, + headroom + ); + + // And the gate must be live again: a fresh full-ceiling admission fits. + let readmit = controller.try_admit_grant(limit, &NoEvictionSource).await; + prop_assert!( + readmit.is_some(), + "gate refused a full-ceiling admission after draining; headroom is wedged" + ); + Ok(()) + } + + proptest! { + #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })] + + /// Liveness: under any interleaving of start / become-resident / + /// cancel-start / exit, once every grant guard is gone the gate's + /// admissible headroom returns to the full ceiling and admits again. A + /// grant that leaks on cancellation (or is released twice) breaks this. 
+ #[test] + fn grants_never_leak_under_random_churn( + limit in 200u64..4000, + ops in arb_ops(), + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_time() + .build() + .unwrap(); + + rt.block_on(async move { + tokio::time::timeout(Duration::from_secs(10), run_workload(limit, ops)) + .await + .unwrap_or_else(|_| Err(TestCaseError::fail( + "grant workload did not complete within the timeout", + ))) + })?; + } + } +} diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 946b20dae6..7e6ca1a298 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -988,7 +988,9 @@ impl MemoryConfig { /// The admission policy for the measured-headroom gate. Reuses /// `worker_memory_ratio` as the usable fraction of the measured limit (the /// host keeps the remainder). - pub fn admission_policy(&self) -> crate::services::active_workers::admission::AdmissionPolicy { + pub(crate) fn admission_policy( + &self, + ) -> crate::services::active_workers::admission::AdmissionPolicy { crate::services::active_workers::admission::AdmissionPolicy { usable_ratio: self.worker_memory_ratio, } diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index 69dd1c769f..23ba710f81 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -27,7 +27,7 @@ use crate::durable_host::recover_stderr_logs; use crate::metrics::storage::record_filesystem_pool_released; use crate::model::{AgentConfig, ExecutionStatus, LookupResult, ReadFileResult, TrapType}; use crate::services::active_workers::{ - FilesystemStoragePermit, HeldComponentCharge, RegisteredConcurrentAccount, + FilesystemStoragePermit, HeldComponentCharge, MemoryGrant, RegisteredConcurrentAccount, WorkerComponentCharge, }; use crate::services::events::{Event, EventsSubscription}; @@ -137,11 +137,6 
@@ pub struct Worker { /// at least that many bytes from the blocking eviction path, ensuring /// enough idle workers are evicted to satisfy the pending write. desired_extra_filesystem_storage: AtomicU64, - /// Cumulative memory bytes this worker has reserved with the admission gate: - /// its initial requirement plus every grow delta. Released back to the gate - /// in full when the worker unloads, so the gate's granted total stays exactly - /// symmetric with what was reserved. - granted_memory: AtomicU64, } impl HasOplog for Worker { @@ -354,7 +349,6 @@ impl Worker { last_resume_request: Mutex::new(Timestamp::now_utc()), snapshot_recovery_disabled: AtomicBool::new(false), desired_extra_filesystem_storage: AtomicU64::new(0), - granted_memory: AtomicU64::new(0), }; // Wire the worker event service into the forwarding oplog so plugin errors @@ -1004,24 +998,26 @@ impl Worker { | WorkerInstance::Deleting => return Ok(()), } - if self.active_workers().try_acquire(delta).await { - self.granted_memory.fetch_add(delta, Ordering::Relaxed); - Ok(()) - } else { + let Some(extra_grant) = self.active_workers().try_acquire(delta).await else { crate::metrics::workers::record_worker_memory_grow_rejected(); - Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)) - } - } + return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)); + }; - /// Release this worker's entire accumulated memory grant back to the - /// admission gate, resetting the running total to zero. Called when the - /// worker stops being resident; a later reload re-accumulates the grant from - /// scratch through the acquire path. - fn release_granted_memory(&self) { - let granted = self.granted_memory.swap(0, Ordering::Relaxed); - if granted > 0 { - self.active_workers().release_memory(granted); + // Re-check state under the lock: the worker may have changed state while + // the gate ran. 
If it is still running, merge the extra grant into the + // running worker so its whole reservation releases together on unload. + // Otherwise drop `extra_grant` here, returning the reservation to the + // gate, and treat the grow as a no-op (matching the non-running arms). + match &mut *self.instance.lock().await { + WorkerInstance::Running(running) => { + running.merge_extra_memory_grant(extra_grant); + } + WorkerInstance::Stopping(_) + | WorkerInstance::WaitingForPermit(_) + | WorkerInstance::Unloaded { .. } + | WorkerInstance::Deleting => {} } + Ok(()) } /// Return `freed_bytes` to the storage semaphore pool. @@ -1672,15 +1668,18 @@ impl Worker { // when stopping via the invocation loop we can stop immediately, no need to go via the stopping status if called_from_invocation_loop { crate::metrics::workers::dec_worker_memory_resident(); - self.release_granted_memory(); + // Dropping `running` at the end of this arm releases its + // memory grant (and component/storage permits) back to the + // gate. **instance_guard = final_state.into_instance(); StopResult::Stopped } else { // drop the running worker, this signals to the invocation loop to start exiting. + // `stop()` consumes the RunningWorker and drops everything but + // its join handle, releasing its memory grant back to the gate. 
let run_loop_handle = running.stop(); let notify = OneShotEvent::new(); crate::metrics::workers::dec_worker_memory_resident(); - self.release_granted_memory(); **instance_guard = WorkerInstance::Stopping(StoppingWorker { notify: notify.clone(), final_state, @@ -2229,6 +2228,7 @@ impl Worker { async fn start_waiting_worker( this: Arc>, + memory_grant: MemoryGrant, component_charge: WorkerComponentCharge, filesystem_storage_permit: Option, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, @@ -2244,6 +2244,7 @@ impl Worker { this.owned_agent_id.clone(), this.queue.clone(), this.clone(), + memory_grant, component_charge, concurrent_agent_permit, oom_retry_count, @@ -2258,9 +2259,8 @@ impl Worker { } _ => { debug!("worker was not waiting for permit anymore, not starting"); - // The grant was reserved before this call; the worker is not - // becoming resident, so release it rather than leak it. - this.release_granted_memory(); + // The worker is not becoming resident: dropping `memory_grant` + // here returns its reservation to the gate. } } } @@ -2401,10 +2401,15 @@ impl WaitingWorker { // Do not gate executor memory while waiting for a per-account // concurrency slot. Otherwise one account could exhaust the // memory headroom with workers that are not allowed to run yet. - parent.active_workers().acquire(memory_requirement).await; - parent - .granted_memory - .fetch_add(memory_requirement, Ordering::Relaxed); + // + // `memory_grant` owns the reservation from here on: it is held as + // a local until the worker becomes resident (when it moves into + // the RunningWorker) or this task ends/aborts (when dropping it + // returns the reservation to the gate). This is what makes a + // start cancelled mid-flight — e.g. the worker being deleted while + // still waiting for its remaining permits — release rather than + // leak its grant. 
+ let memory_grant = parent.active_workers().acquire(memory_requirement).await; // Reserve the component's compiled module size once per resident // component (shared by all its workers). Held for as long as this // worker is resident; the module faults into RAM when the first @@ -2424,7 +2429,7 @@ impl WaitingWorker { warn!( "Failed to determine component charge requirement, not starting: {err}" ); - parent.release_granted_memory(); + // Dropping `memory_grant` here returns its reservation. return; } }; @@ -2478,6 +2483,7 @@ impl WaitingWorker { debug!("Attempting to start worker after acquiring enough permits"); Worker::start_waiting_worker( parent, + memory_grant, component_charge, filesystem_storage_permit, concurrent_agent_permit, @@ -2510,6 +2516,13 @@ struct RunningWorker { handle: Option>, sender: UnboundedSender, queue: Arc>>, + /// The worker's memory reservation with the admission gate, covering its + /// initial requirement plus any grow deltas merged in. Held only to be + /// dropped: dropping it (on stop, eviction, or this worker being dropped for + /// any reason) returns the reservation to the gate, keeping the granted total + /// symmetric with what was reserved. + #[allow(dead_code)] + memory_grant: MemoryGrant, /// Keeps this worker's component module charge alive while it is resident. /// Held only to be dropped: dropping it releases the component's residency /// (and the module reservation if this was the last worker of the component). 
@@ -2545,6 +2558,7 @@ impl RunningWorker { owned_agent_id: OwnedAgentId, queue: Arc>>, parent: Arc>, + memory_grant: MemoryGrant, component_charge: WorkerComponentCharge, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, oom_retry_count: u32, @@ -2595,6 +2609,7 @@ impl RunningWorker { handle: Some(handle), sender, queue, + memory_grant, component_charge: Box::new(component_charge), filesystem_storage_permit: None, waiting_for_command, @@ -2603,6 +2618,13 @@ impl RunningWorker { } } + /// Merge an additional memory grant (from a successful grow) into this + /// worker's grant, so its whole reservation is released together when the + /// worker unloads. + pub fn merge_extra_memory_grant(&mut self, extra: MemoryGrant) { + self.memory_grant.merge(extra); + } + /// Merge additional storage permits into this worker's storage permit. If /// the worker does not yet hold a storage permit, the given permit becomes /// the initial one. Additional calls merge into that initial permit. 
From 4f4a5cae343a460246447dc28e513408b598e6e6 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 14 Jun 2026 14:52:32 -0700 Subject: [PATCH 58/60] feat: move tests around --- .../services/active_workers/admission/mod.rs | 18 +++--- .../src/services/active_workers/mod.rs | 10 +--- .../src/services/active_workers/tests.rs | 60 +++++++++---------- 3 files changed, 42 insertions(+), 46 deletions(-) diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs index 02175c11ee..ec57acd699 100644 --- a/golem-worker-executor/src/services/active_workers/admission/mod.rs +++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs @@ -256,14 +256,16 @@ impl AdmissionController { self.admissible_headroom() } - /// Like [`Self::try_admit`], but on admit returns a [`MemoryGrant`] guard - /// that owns the reservation and releases it on drop. The grant a starting - /// worker holds passes through several `.await` points before the worker - /// becomes resident (per-account concurrency, component charge, filesystem - /// storage); if that work is cancelled — as when the worker is deleted while - /// still waiting — the guard's drop returns the reservation, so a cancelled - /// start cannot leak headroom. - pub(crate) async fn try_admit_grant( + /// Admit `request_bytes`, evicting resident idle-then-warm work if needed, + /// and on success return a [`MemoryGrant`] guard that owns the reservation + /// and releases it on drop; `None` if the request cannot be admitted. + /// + /// The grant a starting worker holds passes through several `.await` points + /// before the worker becomes resident (per-account concurrency, component + /// charge, filesystem storage); if that work is cancelled — as when the + /// worker is deleted while still waiting — the guard's drop returns the + /// reservation, so a cancelled start cannot leak headroom. 
+ pub(crate) async fn admit( self: &Arc, request_bytes: u64, source: &dyn EvictionSource, diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 9cccc56112..24784065b4 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -256,10 +256,7 @@ impl ActiveWorkers { loop { // Evicts idle-then-warm when real headroom is short; rejects (and we // back off) when it cannot make room rather than risking the limit. - if let Some(grant) = admission - .try_admit_grant(memory, &self.eviction_source()) - .await - { + if let Some(grant) = admission.admit(memory, &self.eviction_source()).await { return grant; } debug!("Measured headroom insufficient for {memory}, backing off and retrying"); @@ -286,10 +283,7 @@ impl ActiveWorkers { let Some(admission) = &self.admission else { return Some(MemoryGrant::inert()); }; - match admission - .try_admit_grant(memory, &self.eviction_source()) - .await - { + match admission.admit(memory, &self.eviction_source()).await { Some(grant) => Some(grant), None => { debug!("Measured headroom insufficient for {memory}, not admitting"); diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs index 66017602a3..217c0e21b6 100644 --- a/golem-worker-executor/src/services/active_workers/tests.rs +++ b/golem-worker-executor/src/services/active_workers/tests.rs @@ -1143,13 +1143,15 @@ mod grant_guard_liveness { ) } - /// An in-flight start: the task acquires the grant, then sends it back over - /// `ready` and parks, so the driver can either take the grant (the worker - /// became resident) or abort the task (the start was cancelled, dropping the - /// grant inside the task). 
+ /// An in-flight start: the task runs admission, reports the outcome back over + /// `ready` (the grant on admit, `None` if the gate rejected it), then parks + /// holding the grant. The driver can take the grant (the worker became + /// resident) or abort the task (the start was cancelled, dropping any grant + /// inside the task). The outcome is always reported, so the driver never + /// blocks waiting on a start that was rejected. struct InFlight { handle: JoinHandle<()>, - ready: tokio::sync::oneshot::Receiver, + ready: tokio::sync::oneshot::Receiver>, } /// Drive one randomized workload and assert headroom recovers to the ceiling @@ -1169,14 +1171,15 @@ mod grant_guard_liveness { let controller = controller.clone(); let (tx, rx) = tokio::sync::oneshot::channel(); let handle = tokio::spawn(async move { - if let Some(grant) = - controller.try_admit_grant(bytes, &NoEvictionSource).await - { - // Hand the grant to the driver, then park holding the - // task alive so an abort drops the guard if the driver - // never took it. - let _ = tx.send(grant); - } + // Always report the admission outcome so the driver never + // blocks on a start that was rejected. On admit the grant + // travels to the driver (held in the channel until taken + // as resident or dropped on cancel); on reject we report + // `None`. + let outcome = controller.admit(bytes, &NoEvictionSource).await; + let _ = tx.send(outcome); + // Park so the task stays alive until the driver decides + // its fate (become resident, or be aborted on cancel). std::future::pending::<()>().await; }); in_flight.push(InFlight { handle, ready: rx }); @@ -1185,31 +1188,28 @@ mod grant_guard_liveness { if !in_flight.is_empty() { let i = idx.index(in_flight.len()); let started = in_flight.remove(i); - // Take the grant out of the task (worker is now resident), - // then abort the now-grantless parked task. 
- match started.ready.await { - Ok(grant) => { - resident.push(grant); - started.handle.abort(); - let _ = started.handle.await; - } - Err(_) => { - // Admission was rejected; nothing was granted. - started.handle.abort(); - let _ = started.handle.await; - } + // Becoming resident requires the start to have been + // admitted. Take the grant if there is one (worker is now + // running); a rejected start cannot become resident and is + // simply discarded. Either way abort the parked task. + if let Ok(Some(grant)) = started.ready.await { + resident.push(grant); } + started.handle.abort(); + let _ = started.handle.await; } } Op::CancelStart(idx) => { if !in_flight.is_empty() { let i = idx.index(in_flight.len()); let started = in_flight.remove(i); - // Delete a waiting worker: abort mid-flight. If the task - // had already acquired its grant, the guard is dropped - // inside the aborted task. + // Delete a waiting worker: abort the task and drop the + // `InFlight`. Any grant the start acquired is held in + // `started.ready`; dropping it returns the reservation, + // exactly as aborting a waiting worker mid-flight does. started.handle.abort(); let _ = started.handle.await; + drop(started.ready); } } Op::Exit(idx) => { @@ -1246,7 +1246,7 @@ mod grant_guard_liveness { ); // And the gate must be live again: a fresh full-ceiling admission fits. 
- let readmit = controller.try_admit_grant(limit, &NoEvictionSource).await; + let readmit = controller.admit(limit, &NoEvictionSource).await; prop_assert!( readmit.is_some(), "gate refused a full-ceiling admission after draining; headroom is wedged" From 4cfd2e714d5f686d26fe64dc7702b47984766fad Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 14 Jun 2026 16:36:35 -0700 Subject: [PATCH 59/60] debug: add more logging to figure out why we leak memory and deadlock --- golem-worker-executor/src/worker/mod.rs | 51 ++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index 23ba710f81..9601fd7a47 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -1702,7 +1702,7 @@ impl Worker { run_loop_handle, notify, } => { - run_loop_handle.await.expect("Failed to join run loop"); + join_run_loop_with_watchdog(&self.owned_agent_id, run_loop_handle).await; let mut instance_guard = self.instance.lock().await; let is_deleting = match &*instance_guard { @@ -2308,6 +2308,37 @@ impl Drop for WorkerStatusMetric { } } +/// Joins the invocation-loop task, logging a periodic warning if the join does +/// not complete promptly. +/// +/// The stop path drops the worker's command channel sender and then waits here +/// for the loop task to observe the close and return. If the loop is parked at +/// an await that never observes the closed channel, this join never completes +/// and the delete/stop that triggered it is wedged. The watchdog surfaces that +/// condition with the agent id so a stuck teardown is diagnosable from logs. 
+async fn join_run_loop_with_watchdog(owned_agent_id: &OwnedAgentId, handle: JoinHandle<()>) { + const WARN_AFTER: Duration = Duration::from_secs(10); + + let mut handle = handle; + let mut waited = Duration::ZERO; + loop { + match tokio::time::timeout(WARN_AFTER, &mut handle).await { + Ok(join_result) => { + join_result.expect("Failed to join run loop"); + return; + } + Err(_) => { + waited += WARN_AFTER; + warn!( + agent_id = %owned_agent_id, + waited_secs = waited.as_secs(), + "Still waiting for invocation loop to exit during stop; the loop task may be parked at an uninterruptible await" + ); + } + } + } +} + pub fn merge_agent_env_with_default_env( agent_env: Option>, default_agent_env: BTreeMap, @@ -2514,6 +2545,7 @@ impl Drop for WaitingWorker { #[derive(Debug)] struct RunningWorker { handle: Option>, + owned_agent_id: OwnedAgentId, sender: UnboundedSender, queue: Arc>>, /// The worker's memory reservation with the admission gate, covering its @@ -2550,6 +2582,19 @@ impl Drop for RunningWorker { record_filesystem_pool_released(bytes); } } + // A `RunningWorker` is normally torn down via `stop()`, which takes the + // handle so the invocation loop can be joined after its command channel + // closes. If the handle is still present here and the task has not + // finished, the loop task is being orphaned: it will keep running and + // hold its wasmtime `Store` even though the worker is gone. 
+ if let Some(handle) = &self.handle + && !handle.is_finished() + { + warn!( + agent_id = %self.owned_agent_id, + "RunningWorker dropped while its invocation loop task is still running; the loop task is being orphaned and will leak its Store" + ); + } } } @@ -2568,6 +2613,7 @@ impl RunningWorker { let active_clone = queue.clone(); let owned_agent_id_clone = owned_agent_id.clone(); + let owned_agent_id_log = owned_agent_id.clone(); let waiting_for_command = Arc::new(AtomicBool::new(false)); let waiting_for_command_clone = waiting_for_command.clone(); let interrupt_signal = Arc::new(async_lock::Mutex::new(None)); @@ -2588,6 +2634,7 @@ impl RunningWorker { ); let handle = tokio::task::spawn( async move { + debug!(agent_id = %owned_agent_id_log, "Invocation loop task started"); RunningWorker::invocation_loop( receiver, active_clone, @@ -2601,12 +2648,14 @@ impl RunningWorker { ) .instrument(span) .await; + debug!(agent_id = %owned_agent_id_log, "Invocation loop task exited"); } .in_current_span(), ); RunningWorker { handle: Some(handle), + owned_agent_id, sender, queue, memory_grant, From faa6a2aa1c3079d301caa21341aee9e4694a9052 Mon Sep 17 00:00:00 2001 From: kmatasfp <33095685+kmatas@users.noreply.github.com> Date: Sun, 14 Jun 2026 21:30:55 -0700 Subject: [PATCH 60/60] debug: observe if Store references are kept alive --- golem-worker-executor/src/durable_host/mod.rs | 22 ++++++++ golem-worker-executor/src/metrics.rs | 24 +++++++++ golem-worker-executor/src/worker/mod.rs | 51 +------------------ 3 files changed, 47 insertions(+), 50 deletions(-) diff --git a/golem-worker-executor/src/durable_host/mod.rs b/golem-worker-executor/src/durable_host/mod.rs index 6b134d87fb..01052d8600 100644 --- a/golem-worker-executor/src/durable_host/mod.rs +++ b/golem-worker-executor/src/durable_host/mod.rs @@ -249,6 +249,27 @@ pub struct DurableWorkerCtx { execution_status: Arc>, pub websocket_connection_pool: websocket::WebSocketConnectionPool, resource_limits: Arc, + 
_store_alive_guard: StoreAliveGuard, +} + +/// Increments the live-`Store` gauge on construction and decrements it on drop. +/// Held as a field of [`DurableWorkerCtx`], which is the data of the wasmtime +/// `Store`, so the gauge follows the `Store`'s true lifetime regardless of which +/// reference keeps it alive. A persistent gap above the resident-worker count +/// indicates `Store`s retained after their worker was deleted. +struct StoreAliveGuard; + +impl StoreAliveGuard { + fn new() -> Self { + crate::metrics::workers::inc_worker_store_alive(); + StoreAliveGuard + } +} + +impl Drop for StoreAliveGuard { + fn drop(&mut self) { + crate::metrics::workers::dec_worker_store_alive(); + } } impl DurableWorkerCtx { @@ -476,6 +497,7 @@ impl DurableWorkerCtx { worker_dir, execution_status, resource_limits, + _store_alive_guard: StoreAliveGuard::new(), }) } diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index 980ae3842d..3b4d976f6e 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -260,6 +260,12 @@ pub mod workers { &["executor_id"] ) .unwrap(); + pub static ref WORKER_STORE_ALIVE_COUNT: GaugeVec = register_gauge_vec!( + "golem_worker_store_alive_count", + "Live wasmtime Store contexts on this executor, counted by the Store's own lifetime: incremented when a worker's Store is constructed and decremented when it is dropped. 
Diverging above the resident-worker count means Stores are retained after the owning worker was deleted", + &["executor_id"] + ) + .unwrap(); pub static ref WORKER_KV_CACHE_VALUE_SIZE_BYTES: HistogramVec = register_histogram_vec!( "worker_kv_cache_value_size_bytes", "Bytes of a value written to the Worker-namespace KV cache (worker status blob size)", @@ -361,6 +367,7 @@ pub mod workers { WORKER_WAITING_FOR_MEMORY_COUNT .with_label_values(&[id]) .set(0.0); + WORKER_STORE_ALIVE_COUNT.with_label_values(&[id]).set(0.0); WORKER_MEMORY_GROW_REJECTED_TOTAL .with_label_values(&[id]) .inc_by(0.0); @@ -378,6 +385,23 @@ pub mod workers { .dec(); } + /// Incremented when a worker's wasmtime `Store` context is constructed. + /// Paired with [`dec_worker_store_alive`] from a guard dropped with the + /// `Store` itself, so the gauge tracks the `Store`'s true lifetime rather + /// than the owning worker's accounting. + pub fn inc_worker_store_alive() { + WORKER_STORE_ALIVE_COUNT + .with_label_values(&[crate::metrics::storage::executor_id()]) + .inc(); + } + + /// Decremented when a worker's wasmtime `Store` context is dropped. 
+ pub fn dec_worker_store_alive() { + WORKER_STORE_ALIVE_COUNT + .with_label_values(&[crate::metrics::storage::executor_id()]) + .dec(); + } + pub fn inc_worker_waiting_for_memory() { WORKER_WAITING_FOR_MEMORY_COUNT .with_label_values(&[crate::metrics::storage::executor_id()]) diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index 9601fd7a47..23ba710f81 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -1702,7 +1702,7 @@ impl Worker { run_loop_handle, notify, } => { - join_run_loop_with_watchdog(&self.owned_agent_id, run_loop_handle).await; + run_loop_handle.await.expect("Failed to join run loop"); let mut instance_guard = self.instance.lock().await; let is_deleting = match &*instance_guard { @@ -2308,37 +2308,6 @@ impl Drop for WorkerStatusMetric { } } -/// Joins the invocation-loop task, logging a periodic warning if the join does -/// not complete promptly. -/// -/// The stop path drops the worker's command channel sender and then waits here -/// for the loop task to observe the close and return. If the loop is parked at -/// an await that never observes the closed channel, this join never completes -/// and the delete/stop that triggered it is wedged. The watchdog surfaces that -/// condition with the agent id so a stuck teardown is diagnosable from logs. 
-async fn join_run_loop_with_watchdog(owned_agent_id: &OwnedAgentId, handle: JoinHandle<()>) { - const WARN_AFTER: Duration = Duration::from_secs(10); - - let mut handle = handle; - let mut waited = Duration::ZERO; - loop { - match tokio::time::timeout(WARN_AFTER, &mut handle).await { - Ok(join_result) => { - join_result.expect("Failed to join run loop"); - return; - } - Err(_) => { - waited += WARN_AFTER; - warn!( - agent_id = %owned_agent_id, - waited_secs = waited.as_secs(), - "Still waiting for invocation loop to exit during stop; the loop task may be parked at an uninterruptible await" - ); - } - } - } -} - pub fn merge_agent_env_with_default_env( agent_env: Option>, default_agent_env: BTreeMap, @@ -2545,7 +2514,6 @@ impl Drop for WaitingWorker { #[derive(Debug)] struct RunningWorker { handle: Option>, - owned_agent_id: OwnedAgentId, sender: UnboundedSender, queue: Arc>>, /// The worker's memory reservation with the admission gate, covering its @@ -2582,19 +2550,6 @@ impl Drop for RunningWorker { record_filesystem_pool_released(bytes); } } - // A `RunningWorker` is normally torn down via `stop()`, which takes the - // handle so the invocation loop can be joined after its command channel - // closes. If the handle is still present here and the task has not - // finished, the loop task is being orphaned: it will keep running and - // hold its wasmtime `Store` even though the worker is gone. 
- if let Some(handle) = &self.handle - && !handle.is_finished() - { - warn!( - agent_id = %self.owned_agent_id, - "RunningWorker dropped while its invocation loop task is still running; the loop task is being orphaned and will leak its Store" - ); - } } } @@ -2613,7 +2568,6 @@ impl RunningWorker { let active_clone = queue.clone(); let owned_agent_id_clone = owned_agent_id.clone(); - let owned_agent_id_log = owned_agent_id.clone(); let waiting_for_command = Arc::new(AtomicBool::new(false)); let waiting_for_command_clone = waiting_for_command.clone(); let interrupt_signal = Arc::new(async_lock::Mutex::new(None)); @@ -2634,7 +2588,6 @@ impl RunningWorker { ); let handle = tokio::task::spawn( async move { - debug!(agent_id = %owned_agent_id_log, "Invocation loop task started"); RunningWorker::invocation_loop( receiver, active_clone, @@ -2648,14 +2601,12 @@ impl RunningWorker { ) .instrument(span) .await; - debug!(agent_id = %owned_agent_id_log, "Invocation loop task exited"); } .in_current_span(), ); RunningWorker { handle: Some(handle), - owned_agent_id, sender, queue, memory_grant,