diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 095bdbd015..7a8d2fb9f1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -836,10 +836,14 @@ jobs: - platform: linux/amd64 name: linux/amd64 target: x86_64-unknown-linux-gnu + target_cpu_env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS + target_cpu: "-C target-cpu=x86-64-v3" - platform: linux/arm64 name: linux/arm64 target: aarch64-unknown-linux-gnu cross: true + target_cpu_env: CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUSTFLAGS + target_cpu: "-C target-cpu=neoverse-n1" name: docker-targets-build (${{ matrix.platform.platform }}) steps: - uses: actions/checkout@v5 @@ -854,6 +858,12 @@ jobs: run: | platform=${{ matrix.platform.platform }} echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV + # Target-scoped rustflags pin the instruction-set baseline for the + # published images: x86-64-v3 on amd64, Neoverse-N1 on arm64. The + # per-target CARGO_TARGET_*_RUSTFLAGS form is used because plain + # RUSTFLAGS is ignored when cross-compiling; CARGO_-prefixed vars are + # also passed through into the cross container automatically. 
+ echo "${{ matrix.platform.target_cpu_env }}=${{ matrix.platform.target_cpu }}" >> $GITHUB_ENV - run: cargo install cross if: ${{ matrix.platform.cross }} diff --git a/Cargo.lock b/Cargo.lock index 9296de06e0..1821488058 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4277,7 +4277,10 @@ dependencies = [ "log", "mac_address", "md5", + "metrics", + "metrics-exporter-prometheus", "metrohash", + "mimalloc", "nonempty-collections", "nonzero_ext", "pgvector", @@ -4301,6 +4304,7 @@ dependencies = [ "tempfile", "test-r", "tokio", + "tokio-metrics", "tokio-stream", "tokio-tungstenite 0.25.0", "tokio-util", @@ -5735,6 +5739,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libmimalloc-sys" +version = "0.1.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a45a52f43e1c16f667ccfe4dd8c85b7f7c204fd5e3bf46c5b0db9a5c3c0b8e9" +dependencies = [ + "cc", +] + [[package]] name = "libredox" version = "0.1.15" @@ -6007,6 +6020,46 @@ dependencies = [ "autocfg", ] +[[package]] +name = "metrics" +version = "0.24.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89550ee9f79e88fef3119de263694973a8adb26c21d75322164fb8c493039fe2" +dependencies = [ + "portable-atomic", + "rapidhash", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd7399781913e5393588a8d8c6a2867bf85fb38eaf2502fdce465aad2dc6f034" +dependencies = [ + "base64 0.22.1", + "indexmap 2.14.0", + "metrics", + "metrics-util", + "quanta", + "thiserror 1.0.69", +] + +[[package]] +name = "metrics-util" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8496cc523d1f94c1385dd8f0f0c2c480b2b8aeccb5b7e4485ad6365523ae376" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 
0.15.5", + "metrics", + "quanta", + "rand 0.9.2", + "rand_xoshiro", + "sketches-ddsketch", +] + [[package]] name = "metrohash" version = "1.0.7" @@ -6043,6 +6096,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "mimalloc" +version = "0.1.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d4139bb28d14ad1facf21d5eb8825051b326e172d216b39f6d31df53cc97862" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.17" @@ -7669,6 +7731,21 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quick-error" version = "1.2.3" @@ -7856,6 +7933,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + [[package]] name = "range-set-blaze" version = "0.1.16" @@ -7868,6 +7954,24 @@ dependencies = [ "num-traits", ] +[[package]] +name = "rapidhash" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59" +dependencies = [ + "rustversion", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.11.1", +] + [[package]] name = "rayon" version = "1.11.0" @@ -9103,6 +9207,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" + [[package]] name = "slab" version = "0.4.12" @@ -9946,6 +10056,19 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "tokio-metrics" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9e81d53caf955549b1dec7af4ac2149e94cc25ed97b4a545151140281e2f528" +dependencies = [ + "futures-util", + "metrics", + "pin-project-lite", + "tokio", + "tokio-stream", +] + [[package]] name = "tokio-native-tls" version = "0.3.1" diff --git a/Cargo.toml b/Cargo.toml index b6ba881258..32b71fd001 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -154,10 +154,13 @@ log = "0.4.26" mac_address = "1.1.8" mappable-rc = "0.1.1" md5 = "0.7.0" +metrics = "0.24.2" +metrics-exporter-prometheus = { version = "0.16.2", default-features = false } metrohash = "1.0.7" miette = { version = "7.6.0", features = ["fancy"] } mime = "0.3.17" mime_guess = "2.0.5" +mimalloc = "0.1.52" minijinja = "2.7.0" nanoid = "0.4.0" @@ -248,6 +251,7 @@ textwrap = "0.16.1" thiserror = "2.0.12" time = { version = "0.3.41", features = ["default", "macros"] } tokio = { version = "1.44", features = ["macros", "rt-multi-thread", "sync", "io-std", "net", "tracing", "process", "signal"] } +tokio-metrics = { version = "0.5.0", features = ["metrics-rs-integration"] } tokio-postgres = "0.7.13" tokio-rustls = { version = "0.26.2" } tokio-stream = { version = "0.1", features = ["sync"] } @@ -337,6 +341,7 @@ debug = "line-tables-only" [profile.release] panic = "abort" +lto = "thin" [profile.benchmarks] inherits = "release" diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env index 077c693c32..4349e54ebe 100644 --- 
a/golem-debugging-service/config/debug-worker-executor.sample.env +++ b/golem-debugging-service/config/debug-worker-executor.sample.env @@ -55,6 +55,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -228,6 +230,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml index 8ee03a9c23..01a81fd83a 100644 --- a/golem-debugging-service/config/debug-worker-executor.toml +++ b/golem-debugging-service/config/debug-worker-executor.toml @@ -96,6 +96,8 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" +component_size_coefficient = 2.0 +enable_measured_admission = true worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -364,6 +366,8 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# component_size_coefficient = 2.0 +# enable_measured_admission = true # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # diff --git a/golem-debugging-service/src/config.rs b/golem-debugging-service/src/config.rs index dc6299652b..6a9869550b 100644 --- a/golem-debugging-service/src/config.rs +++ b/golem-debugging-service/src/config.rs @@ -98,6 +98,7 @@ impl DebugConfig { 
max_in_function_retry_delay: std::time::Duration::from_secs(20), max_websocket_connections: 100, quota_service: QuotaServiceConfig::default(), + runtime_metrics_sampling_interval: std::time::Duration::from_secs(5), } } } diff --git a/golem-debugging-service/src/lib.rs b/golem-debugging-service/src/lib.rs index d6062f2cf1..2dea553b6b 100644 --- a/golem-debugging-service/src/lib.rs +++ b/golem-debugging-service/src/lib.rs @@ -375,13 +375,19 @@ pub async fn run_debug_worker_executor + ?Sized + Sen ) -> anyhow::Result { debug!("Initializing debug worker executor"); - let total_system_memory = golem_config.memory.total_system_memory(); - let system_memory = golem_config.memory.system_memory(); - let worker_memory = golem_config.memory.worker_memory(); + let memory_snapshot = + golem_worker_executor::services::active_workers::memory_probe::default_probe( + golem_config.memory.system_memory_override, + ) + .snapshot(); + let total_system_memory = memory_snapshot.limit_bytes; + let used_system_memory = memory_snapshot.current_bytes; + let worker_memory = + (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64; info!( - "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}", + "Measured memory limit: {}, Currently used: {}, Usable for workers: {}", ISizeFormatter::new(total_system_memory, humansize::BINARY), - ISizeFormatter::new(system_memory, humansize::BINARY), + ISizeFormatter::new(used_system_memory, humansize::BINARY), ISizeFormatter::new(worker_memory, humansize::BINARY) ); diff --git a/golem-service-base/src/observability.rs b/golem-service-base/src/observability.rs index 98a83dd36e..f9f54b554c 100644 --- a/golem-service-base/src/observability.rs +++ b/golem-service-base/src/observability.rs @@ -18,21 +18,42 @@ use axum::response::IntoResponse; use axum::routing::get; use http::Response; use prometheus::{Encoder, Registry, TextEncoder}; +use std::sync::Arc; use tokio::net::{TcpListener, ToSocketAddrs}; 
use tokio::task::JoinSet; use tracing::{Instrument, info}; +/// A callback that renders additional metrics in Prometheus text exposition +/// format, appended to the output of the `prometheus`-crate registry on the +/// `/metrics` endpoint. Used to surface metrics from a second metrics façade +/// (e.g. the `metrics`-crate recorder driving tokio-metrics) on the same +/// scrape endpoint. +pub type ExtraMetrics = Arc String + Send + Sync>; + pub async fn start_health_and_metrics_server( addr: impl ToSocketAddrs, registry: Registry, body_message: &'static str, join_set: &mut JoinSet>, +) -> Result { + start_health_and_metrics_server_with_extra(addr, registry, None, body_message, join_set).await +} + +pub async fn start_health_and_metrics_server_with_extra( + addr: impl ToSocketAddrs, + registry: Registry, + extra: Option, + body_message: &'static str, + join_set: &mut JoinSet>, ) -> Result { let app = Router::new() .route("/healthcheck", get(move || async move { body_message })) .route( "/metrics", - get(|| async move { prometheus_metrics(registry.clone()) }), + get(move || { + let extra = extra.clone(); + async move { prometheus_metrics(registry.clone(), extra) } + }), ); let listener = TcpListener::bind(addr).await?; @@ -51,13 +72,17 @@ pub async fn start_health_and_metrics_server( Ok(local_addr.port()) } -pub fn prometheus_metrics(registry: Registry) -> impl IntoResponse { +pub fn prometheus_metrics(registry: Registry, extra: Option) -> impl IntoResponse { let encoder = TextEncoder::new(); let mut buffer = Vec::new(); let metric_families = registry.gather(); encoder.encode(&metric_families, &mut buffer).unwrap(); + if let Some(extra) = extra { + buffer.extend_from_slice(extra().as_bytes()); + } + Response::builder() .header("Content-Type", encoder.format_type()) .body(Body::from(buffer)) diff --git a/golem-test-framework/src/benchmark/config.rs b/golem-test-framework/src/benchmark/config.rs index c011ac65b0..0d172baa24 100644 --- 
a/golem-test-framework/src/benchmark/config.rs +++ b/golem-test-framework/src/benchmark/config.rs @@ -116,7 +116,7 @@ pub struct BenchmarkSuiteItem { impl BenchmarkSuiteItem { pub fn runs(&self, mode: &TestMode) -> Vec { let cluster_size: Vec = match mode { - TestMode::Provided { .. } => { + TestMode::Provided { .. } | TestMode::Cloud { .. } => { vec![0] } _ => self @@ -163,3 +163,83 @@ impl BenchmarkSuiteItem { res } } + +/// Smoke tests for cloud-mode wiring that do not require running services. +/// +/// For a full end-to-end smoke test that exercises actual HTTP clients, +/// cleanup, and the benchmark API contract, run the binary directly against a +/// local Spawned cluster: +/// +/// ```text +/// cargo run --bin benchmarks -- benchmark cold-start-unknown-small \ +/// --size 1 --iterations 1 --length 0 \ +/// cloud \ +/// --api-url http://localhost:8081 \ +/// --apps-base-domain golem.cloud \ +/// --admin-account-id \ +/// --admin-account-email \ +/// --admin-account-token \ +/// --builtin-plugin-owner-account-id \ +/// --default-plan-id +/// ``` +#[cfg(test)] +mod cloud_mode_smoke { + use super::*; + use test_r::test; + use url::Url; + use uuid::Uuid; + + fn cloud_mode() -> TestMode { + TestMode::Cloud { + api_url: Url::parse("https://release.dev-api.golem.cloud").unwrap(), + apps_base_domain: "apps.dev.golem.cloud".to_string(), + admin_account_token: "test-token".to_string(), + builtin_plugin_owner_account_id: Uuid::nil(), + default_plan_id: Uuid::nil(), + shard_manager_grpc_host: None, + shard_manager_grpc_port: None, + component_directory: "test-components".to_string(), + } + } + + /// Cloud mode always returns exactly one `RunConfig` with `cluster_size=0`, + /// regardless of how many `cluster_size` values the suite item specifies. 
+ #[test] + fn runs_returns_single_cluster_size_zero_run() { + let mode = cloud_mode(); + let item = BenchmarkSuiteItem { + name: "cold-start-unknown-small".to_string(), + iterations: 3, + cluster_size: vec![1, 3, 5], // must be ignored in cloud mode + size: vec![10], + length: vec![100], + disable_compilation_cache: None, + }; + let runs = item.runs(&mode); + assert_eq!(runs.len(), 1, "cloud mode ignores cluster_size variations"); + assert_eq!(runs[0].cluster_size, 0, "cloud mode cluster_size must be 0"); + assert_eq!(runs[0].size, 10); + assert_eq!(runs[0].length, 100); + } + + /// Multiple size and length combinations still expand normally; only + /// `cluster_size` is collapsed. + #[test] + fn runs_expands_size_and_length_but_not_cluster_size() { + let mode = cloud_mode(); + let item = BenchmarkSuiteItem { + name: "latency-small".to_string(), + iterations: 1, + cluster_size: vec![1, 3], + size: vec![5, 10], + length: vec![50, 100], + disable_compilation_cache: None, + }; + let runs = item.runs(&mode); + // 1 (collapsed cluster_size) × 2 sizes × 2 lengths = 4 runs + assert_eq!(runs.len(), 4); + for r in &runs { + assert_eq!(r.cluster_size, 0); + } + } +} diff --git a/golem-test-framework/src/benchmark/mod.rs b/golem-test-framework/src/benchmark/mod.rs index 1f349afddd..5e82adde15 100644 --- a/golem-test-framework/src/benchmark/mod.rs +++ b/golem-test-framework/src/benchmark/mod.rs @@ -16,7 +16,9 @@ mod config; mod results; pub use config::{BenchmarkConfig, BenchmarkSuite, BenchmarkSuiteItem, RunConfig}; -pub use results::{BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey}; +pub use results::{ + BenchmarkResult, BenchmarkRunResult, BenchmarkSuiteResult, ResultKey, RunMetadata, +}; use crate::config::benchmark::TestMode; use async_trait::async_trait; @@ -301,6 +303,7 @@ impl BenchmarkApi for B { description: B::description().to_string(), runs, results, + run_id: None, } } } diff --git a/golem-test-framework/src/benchmark/results.rs 
b/golem-test-framework/src/benchmark/results.rs index 1cb0f329b6..afb7319a7d 100644 --- a/golem-test-framework/src/benchmark/results.rs +++ b/golem-test-framework/src/benchmark/results.rs @@ -484,6 +484,97 @@ impl Display for BenchmarkResultView { } } +/// Cloud-mode run metadata collected by the buildspec and passed via environment variables. +/// All fields are optional — missing env vars produce `None` rather than failing the run. +#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RunMetadata { + /// The `golem-oss` commit SHA that was built and deployed. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub golem_oss_commit_sha: Option, + /// The `golem-cloud` (kubernetes manifests) commit SHA that was deployed. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub kubernetes_manifest_commit_sha: Option, + /// Number of Ready `worker-executor` pods observed at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub observed_cluster_size: Option, + /// Container image tag of the deployed `worker-executor`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_executor_image_tag: Option, + /// Container image tag of the deployed `registry-service`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub registry_service_image_tag: Option, + /// Container image tag of the deployed `worker-service`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_service_image_tag: Option, + /// Aurora ACU capacity for the main (`golem_dev`) cluster at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub aurora_acu_main: Option, + /// Aurora ACU capacity for the indexed-storage cluster at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub aurora_acu_indexed: Option, + /// Aurora ACU capacity for the keyvalue-storage cluster at run start. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub aurora_acu_keyvalue: Option, + /// Ready replica count for `worker-executor` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_executor_replicas: Option, + /// Ready replica count for `worker-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub worker_service_replicas: Option, + /// Ready replica count for `registry-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub registry_service_replicas: Option, + /// Ready replica count for `compilation-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub compilation_service_replicas: Option, + /// Ready replica count for `debugging-service` at run start. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub debugging_service_replicas: Option, + /// Free-form note from the `workflow_dispatch` trigger. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub note: Option, +} + +impl RunMetadata { + /// Reads all `GOLEM_BENCH_*` environment variables and returns a populated + /// `RunMetadata`. Missing variables produce `None` for that field. 
+ pub fn from_env() -> Self { + fn env_str(key: &str) -> Option { + std::env::var(key).ok().filter(|v| !v.is_empty()) + } + fn env_u32(key: &str) -> Option { + env_str(key).and_then(|v| v.parse().ok()) + } + fn env_f64(key: &str) -> Option { + env_str(key).and_then(|v| v.parse().ok()) + } + + Self { + golem_oss_commit_sha: env_str("GOLEM_BENCH_OSS_COMMIT_SHA"), + kubernetes_manifest_commit_sha: env_str("GOLEM_BENCH_K8S_MANIFEST_COMMIT_SHA"), + observed_cluster_size: env_u32("GOLEM_BENCH_OBSERVED_CLUSTER_SIZE"), + worker_executor_image_tag: env_str("GOLEM_BENCH_WORKER_EXECUTOR_IMAGE_TAG"), + registry_service_image_tag: env_str("GOLEM_BENCH_REGISTRY_SERVICE_IMAGE_TAG"), + worker_service_image_tag: env_str("GOLEM_BENCH_WORKER_SERVICE_IMAGE_TAG"), + aurora_acu_main: env_f64("GOLEM_BENCH_AURORA_ACU_MAIN"), + aurora_acu_indexed: env_f64("GOLEM_BENCH_AURORA_ACU_INDEXED"), + aurora_acu_keyvalue: env_f64("GOLEM_BENCH_AURORA_ACU_KEYVALUE"), + worker_executor_replicas: env_u32("GOLEM_BENCH_WORKER_EXECUTOR_REPLICAS"), + worker_service_replicas: env_u32("GOLEM_BENCH_WORKER_SERVICE_REPLICAS"), + registry_service_replicas: env_u32("GOLEM_BENCH_REGISTRY_SERVICE_REPLICAS"), + compilation_service_replicas: env_u32("GOLEM_BENCH_COMPILATION_SERVICE_REPLICAS"), + debugging_service_replicas: env_u32("GOLEM_BENCH_DEBUGGING_SERVICE_REPLICAS"), + note: env_str("GOLEM_BENCH_RUN_NOTE"), + } + } + + /// Returns `true` if every field is `None` (nothing was read from env). + pub fn is_empty(&self) -> bool { + self == &Self::default() + } +} + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct BenchmarkSuiteResultCollection { pub runs: Vec, @@ -491,10 +582,20 @@ pub struct BenchmarkSuiteResultCollection { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct BenchmarkSuiteResult { + /// Result format version. Always `1` for results produced by this binary. 
+ pub schema_version: u32, pub suite: String, pub environment: String, pub version: String, pub timestamp: DateTime, + /// Suite-level run-id. Set in cloud mode to `bench-{run_id}` to allow + /// cross-run correlation and garbage collection of orphaned state. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub run_id: Option, + /// Cloud-mode run metadata populated from `GOLEM_BENCH_*` environment variables. + /// `None` in Spawned or Provided modes where cluster metadata is not available. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub run_metadata: Option, pub results: Vec, } @@ -526,10 +627,13 @@ impl BenchmarkSuiteResult { ); Self { + schema_version: 1, suite: suite.to_string(), environment, version: golem_common::golem_version().to_string(), timestamp: Utc::now(), + run_id: None, + run_metadata: None, results: vec![], } } @@ -606,6 +710,10 @@ pub struct BenchmarkResult { pub description: String, pub runs: Vec, pub results: Vec, + /// Suite-level run-id. Set in cloud mode to `bench-{run_id}` to allow + /// cross-run correlation and garbage collection of orphaned state. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub run_id: Option, } impl BenchmarkResult { diff --git a/golem-test-framework/src/components/component_compilation_service/mod.rs b/golem-test-framework/src/components/component_compilation_service/mod.rs index f80d2f84d6..50da099698 100644 --- a/golem-test-framework/src/components/component_compilation_service/mod.rs +++ b/golem-test-framework/src/components/component_compilation_service/mod.rs @@ -21,6 +21,7 @@ use tracing::Level; pub mod provided; pub mod spawned; +pub mod unavailable; #[async_trait] pub trait ComponentCompilationService: Send + Sync { diff --git a/golem-test-framework/src/components/component_compilation_service/unavailable.rs b/golem-test-framework/src/components/component_compilation_service/unavailable.rs new file mode 100644 index 0000000000..fb355cd0b3 --- /dev/null +++ b/golem-test-framework/src/components/component_compilation_service/unavailable.rs @@ -0,0 +1,35 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::ComponentCompilationService; +use async_trait::async_trait; + +/// A `ComponentCompilationService` that is not directly reachable. Used in +/// cloud mode, where it is an internal cluster component with no external +/// exposure. `kill` is a no-op so that `kill_all()` completes; operational +/// methods panic with a clear message. 
+pub struct UnavailableComponentCompilationService; + +#[async_trait] +impl ComponentCompilationService for UnavailableComponentCompilationService { + fn grpc_host(&self) -> String { + panic!("component_compilation_service() is not available in cloud mode"); + } + + fn grpc_port(&self) -> u16 { + panic!("component_compilation_service() is not available in cloud mode"); + } + + async fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/rdb/mod.rs b/golem-test-framework/src/components/rdb/mod.rs index 5f1b5c7fb8..ce8863c10e 100644 --- a/golem-test-framework/src/components/rdb/mod.rs +++ b/golem-test-framework/src/components/rdb/mod.rs @@ -29,6 +29,7 @@ pub mod docker_mysql; pub mod docker_postgres; pub mod provided_postgres; pub mod sqlite; +pub mod unavailable; #[async_trait] pub trait Rdb: Send + Sync { diff --git a/golem-test-framework/src/components/rdb/unavailable.rs b/golem-test-framework/src/components/rdb/unavailable.rs new file mode 100644 index 0000000000..1df99efe70 --- /dev/null +++ b/golem-test-framework/src/components/rdb/unavailable.rs @@ -0,0 +1,31 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{DbInfo, Rdb}; +use async_trait::async_trait; + +/// An `Rdb` that is not directly reachable. Used in cloud mode, where the +/// database is an internal cluster component with no external exposure. 
+/// Lifecycle teardown (`kill`) is a no-op so that `kill_all()` completes; +/// operational methods panic with a clear message. +pub struct UnavailableRdb; + +#[async_trait] +impl Rdb for UnavailableRdb { + fn info(&self) -> DbInfo { + panic!("rdb() is not available in cloud mode"); + } + + async fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/redis/mod.rs b/golem-test-framework/src/components/redis/mod.rs index df14595c7a..62346ec293 100644 --- a/golem-test-framework/src/components/redis/mod.rs +++ b/golem-test-framework/src/components/redis/mod.rs @@ -20,6 +20,7 @@ use tracing::info; pub mod provided; pub mod spawned; pub mod spawned_tls; +pub mod unavailable; #[async_trait] pub trait Redis: Send + Sync { diff --git a/golem-test-framework/src/components/redis/unavailable.rs b/golem-test-framework/src/components/redis/unavailable.rs new file mode 100644 index 0000000000..0f24489fe9 --- /dev/null +++ b/golem-test-framework/src/components/redis/unavailable.rs @@ -0,0 +1,43 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::Redis; +use async_trait::async_trait; + +/// A `Redis` that is not directly reachable. Used in cloud mode, where Redis +/// is an internal cluster component with no external exposure. `kill` is a +/// no-op so that `kill_all()` completes; operational methods panic with a +/// clear message. 
+pub struct UnavailableRedis; + +#[async_trait] +impl Redis for UnavailableRedis { + fn assert_valid(&self) { + panic!("redis() is not available in cloud mode"); + } + + fn private_host(&self) -> String { + panic!("redis() is not available in cloud mode"); + } + + fn private_port(&self) -> u16 { + panic!("redis() is not available in cloud mode"); + } + + fn prefix(&self) -> &str { + panic!("redis() is not available in cloud mode"); + } + + async fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/redis_monitor/mod.rs b/golem-test-framework/src/components/redis_monitor/mod.rs index eb73fe0e0d..2a24665ec5 100644 --- a/golem-test-framework/src/components/redis_monitor/mod.rs +++ b/golem-test-framework/src/components/redis_monitor/mod.rs @@ -13,6 +13,7 @@ // limitations under the License. pub mod spawned; +pub mod unavailable; pub trait RedisMonitor: Send + Sync { fn assert_valid(&self); diff --git a/golem-test-framework/src/components/redis_monitor/unavailable.rs b/golem-test-framework/src/components/redis_monitor/unavailable.rs new file mode 100644 index 0000000000..bdde53d231 --- /dev/null +++ b/golem-test-framework/src/components/redis_monitor/unavailable.rs @@ -0,0 +1,29 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::RedisMonitor; + +/// A `RedisMonitor` that is not directly reachable. Used in cloud mode, where +/// Redis is an internal cluster component with no external exposure. 
`kill` is +/// a no-op so that `kill_all()` completes; operational methods panic with a +/// clear message. +pub struct UnavailableRedisMonitor; + +impl RedisMonitor for UnavailableRedisMonitor { + fn assert_valid(&self) { + panic!("redis_monitor() is not available in cloud mode"); + } + + fn kill(&self) {} +} diff --git a/golem-test-framework/src/components/registry_service/cloud.rs b/golem-test-framework/src/components/registry_service/cloud.rs new file mode 100644 index 0000000000..79e5d03935 --- /dev/null +++ b/golem-test-framework/src/components/registry_service/cloud.rs @@ -0,0 +1,167 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::RegistryService; +use async_trait::async_trait; +use golem_client::api::RegistryServiceClientLive; +use golem_client::{Context, Security}; +use golem_common::model::account::{AccountEmail, AccountId}; +use golem_common::model::auth::TokenSecret; +use golem_common::model::plan::PlanId; +use std::time::Duration; +use tokio::sync::OnceCell; +use tracing::info; +use url::Url; + +/// Registry-service client for cloud mode. +/// +/// In the deployed Golem environment both registry-service and worker-service +/// are reachable behind a single Gateway API hostname +/// (e.g. `https://release.dev-api.golem.cloud`). This struct holds that shared +/// `api_url`; routing to the correct backend service is done by the Gateway +/// based on URL path. 
+pub struct CloudRegistryService { + api_url: Url, + admin_token: TokenSecret, + builtin_plugin_owner_account_id: AccountId, + default_plan_id: PlanId, + base_http_client: OnceCell, +} + +impl CloudRegistryService { + pub fn new( + api_url: Url, + admin_token: TokenSecret, + builtin_plugin_owner_account_id: AccountId, + default_plan_id: PlanId, + ) -> Self { + info!("Using cloud API gateway at {api_url}"); + Self { + api_url, + admin_token, + builtin_plugin_owner_account_id, + default_plan_id, + base_http_client: OnceCell::new(), + } + } +} + +/// Constructs the tuned HTTP client for cloud-mode benchmark connections. +/// +/// Settings: large connection pool (1024), 90-second idle timeout, TCP +/// nodelay, and 180-second request timeout. +/// +/// Note: `http2_prior_knowledge()` is deliberately **not** set. Prior +/// knowledge is for h2c (HTTP/2 over plain HTTP). All cloud endpoints are +/// HTTPS, where HTTP/2 is negotiated through ALPN during the TLS handshake +/// (TLS termination happens at Envoy). Setting prior knowledge would bypass +/// ALPN and can cause protocol errors. 
+pub fn new_cloud_reqwest_client() -> reqwest_middleware::ClientWithMiddleware { + let client = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(1024) + .pool_idle_timeout(Duration::from_secs(90)) + .tcp_nodelay(true) + .timeout(Duration::from_secs(180)) + .build() + .expect("Failed to build cloud HTTP client"); + reqwest_middleware::ClientBuilder::new(client) + .with(reqwest_tracing::TracingMiddleware::default()) + .build() +} + +#[async_trait] +impl RegistryService for CloudRegistryService { + fn http_host(&self) -> String { + self.api_url.host_str().unwrap_or("localhost").to_string() + } + + fn http_port(&self) -> u16 { + self.api_url.port_or_known_default().unwrap_or(443) + } + + fn grpc_host(&self) -> String { + panic!("grpc_host() is not available through the Gateway in cloud mode"); + } + + fn grpc_port(&self) -> u16 { + panic!("grpc_port() is not available through the Gateway in cloud mode"); + } + + fn admin_account_id(&self) -> AccountId { + AccountId(uuid::Uuid::nil()) + } + + fn admin_account_email(&self) -> AccountEmail { + AccountEmail::new(String::new()) + } + + fn admin_account_token(&self) -> TokenSecret { + self.admin_token.clone() + } + + fn builtin_plugin_owner_account_id(&self) -> AccountId { + self.builtin_plugin_owner_account_id + } + + fn default_plan(&self) -> PlanId { + self.default_plan_id + } + + fn low_fuel_plan(&self) -> PlanId { + panic!( + "low_fuel_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + fn low_disk_space_plan(&self) -> PlanId { + panic!( + "low_disk_space_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + fn low_http_calls_plan(&self) -> PlanId { + panic!( + "low_http_calls_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + fn low_rpc_calls_plan(&self) -> PlanId { + panic!( + 
"low_rpc_calls_plan is not supported in cloud mode; \ + the benchmark calling this method requires a local or provided cluster" + ); + } + + async fn kill(&self) {} + + async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware { + self.base_http_client + .get_or_init(|| async { new_cloud_reqwest_client() }) + .await + .clone() + } + + async fn client(&self, token: &TokenSecret) -> RegistryServiceClientLive { + RegistryServiceClientLive { + context: Context { + client: self.base_http_client().await, + base_url: self.api_url.clone(), + security_token: Security::Bearer(token.secret().to_string()), + }, + } + } +} diff --git a/golem-test-framework/src/components/registry_service/mod.rs b/golem-test-framework/src/components/registry_service/mod.rs index d38f88577d..42b0b9ddfd 100644 --- a/golem-test-framework/src/components/registry_service/mod.rs +++ b/golem-test-framework/src/components/registry_service/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+pub mod cloud; pub mod provided; pub mod spawned; diff --git a/golem-test-framework/src/components/shard_manager/mod.rs b/golem-test-framework/src/components/shard_manager/mod.rs index 5245865e4b..91ed2ed2da 100644 --- a/golem-test-framework/src/components/shard_manager/mod.rs +++ b/golem-test-framework/src/components/shard_manager/mod.rs @@ -14,6 +14,7 @@ pub mod provided; pub mod spawned; +pub mod unavailable; use super::rdb::Rdb; use super::registry_service::RegistryService; @@ -30,7 +31,7 @@ use std::sync::Arc; use std::time::Duration; use tonic::codec::CompressionEncoding; use tonic::transport::Channel; -use tracing::Level; +use tracing::{Level, warn}; #[async_trait] pub trait ShardManager: Send + Sync { @@ -46,25 +47,30 @@ pub trait ShardManager: Send + Sync { async fn restart(&self, number_of_shards_override: Option); async fn get_routing_table(&self) -> crate::Result { - let routing_table = self - .client() - .await - .get_routing_table(GetRoutingTableRequest {}) - .await - .expect("Unable to fetch the routing table from shard-manager-service"); - - match routing_table.into_inner() { - shardmanager::v1::GetRoutingTableResponse { - result: - Some(shardmanager::v1::get_routing_table_response::Result::Success(routing_table)), - } => Ok(routing_table - .try_into() - .map_err(|e| anyhow!("Failed converting routing table: {e}"))?), - shardmanager::v1::GetRoutingTableResponse { - result: Some(shardmanager::v1::get_routing_table_response::Result::Failure(err)), - } => Err(anyhow!("Failed to get routing table: {err:?}")), - _ => Err(anyhow!("Failed to get routing table")), + // Retry with backoff to tolerate transient port-forward reconnects. + // The port-forward watchdog restarts in ~500ms, so 10 attempts with + // 1s delay gives ~10s of tolerance before giving up. 
+ let max_attempts = 10; + let retry_delay = Duration::from_secs(1); + let mut last_err = anyhow!("get_routing_table: no attempts made"); + + for attempt in 1..=max_attempts { + match try_get_routing_table(&self.grpc_host(), self.grpc_port()).await { + Ok(rt) => return Ok(rt), + Err(err) => { + warn!( + attempt, + max_attempts, + error = %err, + "Failed to fetch routing table, retrying..." + ); + last_err = err; + tokio::time::sleep(retry_delay).await; + } + } } + + Err(last_err) } } @@ -76,6 +82,34 @@ async fn new_client(host: &str, grpc_port: u16) -> ShardManagerServiceClient crate::Result { + let mut client = ShardManagerServiceClient::connect(format!("http://{host}:{grpc_port}")) + .await + .map_err(|e| anyhow!("Failed to connect to shard-manager: {e}"))? + .send_compressed(CompressionEncoding::Gzip) + .accept_compressed(CompressionEncoding::Gzip); + + let routing_table = client + .get_routing_table(GetRoutingTableRequest {}) + .await + .map_err(|e| { + anyhow!("Unable to fetch the routing table from shard-manager-service: {e}") + })?; + + match routing_table.into_inner() { + shardmanager::v1::GetRoutingTableResponse { + result: + Some(shardmanager::v1::get_routing_table_response::Result::Success(routing_table)), + } => Ok(routing_table + .try_into() + .map_err(|e| anyhow!("Failed converting routing table: {e}"))?), + shardmanager::v1::GetRoutingTableResponse { + result: Some(shardmanager::v1::get_routing_table_response::Result::Failure(err)), + } => Err(anyhow!("Failed to get routing table: {err:?}")), + _ => Err(anyhow!("Failed to get routing table")), + } +} + async fn wait_for_startup( host: &str, grpc_port: u16, diff --git a/golem-test-framework/src/components/shard_manager/provided.rs b/golem-test-framework/src/components/shard_manager/provided.rs index d7e4ff1305..84d213a5fb 100644 --- a/golem-test-framework/src/components/shard_manager/provided.rs +++ b/golem-test-framework/src/components/shard_manager/provided.rs @@ -40,10 +40,10 @@ impl ShardManager 
for ProvidedShardManager { } async fn kill(&self) { - panic!("Cannot kill provided shard manager"); + // Nothing to do — we do not own this shard manager process. } async fn restart(&self, _number_of_shards_override: Option) { - panic!("Cannot restart provided shard manager"); + // Nothing to do — we do not own this shard manager process. } } diff --git a/golem-test-framework/src/components/shard_manager/unavailable.rs b/golem-test-framework/src/components/shard_manager/unavailable.rs new file mode 100644 index 0000000000..834dfb8d2c --- /dev/null +++ b/golem-test-framework/src/components/shard_manager/unavailable.rs @@ -0,0 +1,56 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::ShardManager; +use async_trait::async_trait; +use golem_common::model::RoutingTable; + +/// A `ShardManager` that is not directly reachable. Used in cloud mode when no +/// shard-manager port-forward is configured; pass `--shard-manager-grpc-host` +/// and `--shard-manager-grpc-port` to use a real `ProvidedShardManager` +/// instead. +/// +/// `kill`/`restart` are no-ops. `get_routing_table()` returns an error so that +/// callers (e.g. the throughput benchmark) can fall back to the unlabeled +/// single-bucket mode. The host/port accessors panic with a clear message. 
+pub struct UnavailableShardManager; + +#[async_trait] +impl ShardManager for UnavailableShardManager { + fn grpc_host(&self) -> String { + panic!( + "shard_manager() requires --shard-manager-grpc-host and \ + --shard-manager-grpc-port to be configured in cloud mode" + ); + } + + fn grpc_port(&self) -> u16 { + panic!( + "shard_manager() requires --shard-manager-grpc-host and \ + --shard-manager-grpc-port to be configured in cloud mode" + ); + } + + async fn kill(&self) {} + + async fn restart(&self, _number_of_shards_override: Option) {} + + async fn get_routing_table(&self) -> crate::Result { + Err(anyhow::anyhow!( + "shard_manager is not configured in cloud mode; \ + pass --shard-manager-grpc-host and --shard-manager-grpc-port \ + to enable routing table fetch and local/remote RPC labeling" + )) + } +} diff --git a/golem-test-framework/src/components/worker_executor_cluster/mod.rs b/golem-test-framework/src/components/worker_executor_cluster/mod.rs index 2dc8e21745..e1db10b237 100644 --- a/golem-test-framework/src/components/worker_executor_cluster/mod.rs +++ b/golem-test-framework/src/components/worker_executor_cluster/mod.rs @@ -18,6 +18,7 @@ use std::sync::Arc; pub mod provided; pub mod spawned; +pub mod unavailable; #[async_trait] pub trait WorkerExecutorCluster: Send + Sync { diff --git a/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs new file mode 100644 index 0000000000..53a5cc87be --- /dev/null +++ b/golem-test-framework/src/components/worker_executor_cluster/unavailable.rs @@ -0,0 +1,63 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::components::worker_executor::WorkerExecutor; +use crate::components::worker_executor_cluster::WorkerExecutorCluster; +use async_trait::async_trait; +use std::sync::Arc; + +/// A `WorkerExecutorCluster` whose individual executors are not directly +/// reachable. Used in cloud mode, where executors are internal cluster +/// components with no external exposure. +/// +/// Lifecycle teardown methods (`kill_all`, `restart_all`) are no-ops so that +/// `kill_all()` completes. `is_running()` returns `true` so that +/// `ensure_all_deps_running()` is a no-op. Per-executor operations panic with a +/// clear message. 
+pub struct UnavailableWorkerExecutorCluster; + +#[async_trait] +impl WorkerExecutorCluster for UnavailableWorkerExecutorCluster { + fn size(&self) -> usize { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn kill_all(&self) {} + + async fn restart_all(&self) {} + + async fn stop(&self, _index: usize) { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn start(&self, _index: usize) { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + fn to_vec(&self) -> Vec> { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn stopped_indices(&self) -> Vec { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn started_indices(&self) -> Vec { + panic!("worker_executor_cluster() is not available in cloud mode"); + } + + async fn is_running(&self) -> bool { + true + } +} diff --git a/golem-test-framework/src/components/worker_service/cloud.rs b/golem-test-framework/src/components/worker_service/cloud.rs new file mode 100644 index 0000000000..ceb60f4fbe --- /dev/null +++ b/golem-test-framework/src/components/worker_service/cloud.rs @@ -0,0 +1,113 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::components::registry_service::cloud::new_cloud_reqwest_client; +use crate::components::worker_service::WorkerService; +use async_trait::async_trait; +use golem_client::api::{AgentClientLive, WorkerClientLive}; +use golem_client::{Context, Security}; +use golem_common::model::auth::TokenSecret; +use tokio::sync::OnceCell; +use tracing::info; +use url::Url; + +/// Worker-service client for cloud mode. +/// +/// In the deployed Golem environment both registry-service and worker-service +/// are reachable behind a single Gateway API hostname +/// (e.g. `https://release.dev-api.golem.cloud`). This struct holds that shared +/// `api_url`; routing to worker-service is done by the Gateway based on URL +/// path (`/v1/components/*/workers/**`, `/v1/agents/**`). +pub struct CloudWorkerService { + api_url: Url, + base_http_client: OnceCell, +} + +impl CloudWorkerService { + pub fn new(api_url: Url) -> Self { + info!("Using cloud worker-service via API gateway at {api_url}"); + Self { + api_url, + base_http_client: OnceCell::new(), + } + } +} + +#[async_trait] +impl WorkerService for CloudWorkerService { + fn http_host(&self) -> String { + self.api_url.host_str().unwrap_or("localhost").to_string() + } + + fn http_port(&self) -> u16 { + self.api_url.port_or_known_default().unwrap_or(443) + } + + fn grpc_host(&self) -> String { + panic!("grpc_host() is not available through the Gateway in cloud mode"); + } + + fn gprc_port(&self) -> u16 { + panic!("gprc_port() is not available through the Gateway in cloud mode"); + } + + fn custom_request_host(&self) -> String { + // Code-first HTTP API deployments are reached via the apps base domain + // (*.apps.dev.golem.cloud), not through this host. + panic!("custom_request_host() is not available in cloud mode"); + } + + fn custom_request_port(&self) -> u16 { + // Code-first HTTP API deployments are reached via the apps base domain + // (*.apps.dev.golem.cloud), not through this port. 
+ panic!("custom_request_port() is not available in cloud mode"); + } + + fn mcp_port(&self) -> u16 { + panic!("mcp_port() is not available in cloud mode"); + } + + async fn kill(&self) {} + + async fn base_http_client(&self) -> reqwest_middleware::ClientWithMiddleware { + self.base_http_client + .get_or_init(|| async { new_cloud_reqwest_client() }) + .await + .clone() + } + + /// Overrides the trait default to use the configured API gateway URL + /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`. + async fn worker_http_client(&self, token: &TokenSecret) -> WorkerClientLive { + WorkerClientLive { + context: Context { + client: self.base_http_client().await, + base_url: self.api_url.clone(), + security_token: Security::Bearer(token.secret().to_string()), + }, + } + } + + /// Overrides the trait default to use the configured API gateway URL + /// (including scheme/TLS), rather than rebuilding `http://{host}:{port}`. + async fn agent_http_client(&self, token: &TokenSecret) -> AgentClientLive { + AgentClientLive { + context: Context { + client: self.base_http_client().await, + base_url: self.api_url.clone(), + security_token: Security::Bearer(token.secret().to_string()), + }, + } + } +} diff --git a/golem-test-framework/src/components/worker_service/mod.rs b/golem-test-framework/src/components/worker_service/mod.rs index 6885e86696..126cc988c9 100644 --- a/golem-test-framework/src/components/worker_service/mod.rs +++ b/golem-test-framework/src/components/worker_service/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+pub mod cloud; pub mod provided; pub mod spawned; diff --git a/golem-test-framework/src/config/benchmark.rs b/golem-test-framework/src/config/benchmark.rs index a1a304436b..34ac140d23 100644 --- a/golem-test-framework/src/config/benchmark.rs +++ b/golem-test-framework/src/config/benchmark.rs @@ -16,15 +16,21 @@ use crate::benchmark::BenchmarkConfig; use crate::components::component_compilation_service::ComponentCompilationService; use crate::components::component_compilation_service::provided::ProvidedComponentCompilationService; use crate::components::component_compilation_service::spawned::SpawnedComponentCompilationService; +use crate::components::component_compilation_service::unavailable::UnavailableComponentCompilationService; +use crate::components::rdb::PostgresInfo; +use crate::components::rdb::Rdb; use crate::components::rdb::docker_postgres::DockerPostgresRdb; use crate::components::rdb::provided_postgres::ProvidedPostgresRdb; -use crate::components::rdb::{PostgresInfo, Rdb}; +use crate::components::rdb::unavailable::UnavailableRdb; use crate::components::redis::Redis; use crate::components::redis::provided::ProvidedRedis; use crate::components::redis::spawned::SpawnedRedis; +use crate::components::redis::unavailable::UnavailableRedis; use crate::components::redis_monitor::RedisMonitor; use crate::components::redis_monitor::spawned::SpawnedRedisMonitor; +use crate::components::redis_monitor::unavailable::UnavailableRedisMonitor; use crate::components::registry_service::RegistryService; +use crate::components::registry_service::cloud::CloudRegistryService; use crate::components::registry_service::provided::ProvidedRegistryService; use crate::components::registry_service::spawned::SpawnedRegistryService; use crate::components::service::Service; @@ -32,10 +38,13 @@ use crate::components::service::spawned::SpawnedService; use crate::components::shard_manager::ShardManager; use crate::components::shard_manager::provided::ProvidedShardManager; use 
crate::components::shard_manager::spawned::SpawnedShardManager; +use crate::components::shard_manager::unavailable::UnavailableShardManager; use crate::components::worker_executor_cluster::WorkerExecutorCluster; use crate::components::worker_executor_cluster::provided::ProvidedWorkerExecutorCluster; use crate::components::worker_executor_cluster::spawned::SpawnedWorkerExecutorCluster; +use crate::components::worker_executor_cluster::unavailable::UnavailableWorkerExecutorCluster; use crate::components::worker_service::WorkerService; +use crate::components::worker_service::cloud::CloudWorkerService; use crate::components::worker_service::provided::ProvidedWorkerService; use crate::components::worker_service::spawned::SpawnedWorkerService; use crate::config::TestDependencies; @@ -51,11 +60,24 @@ use golem_service_base::storage::blob::BlobStorage; use golem_service_base::storage::blob::fs::FileSystemBlobStorage; use std::collections::HashMap; use std::path::{Path, PathBuf}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use tempfile::TempDir; use tracing::Level; +use url::Url; use uuid::Uuid; +/// Process-level UUID generated on the first cloud-mode benchmark context +/// creation. All cloud contexts within the same binary invocation share this +/// run-id, which is used to prefix account/app/env names +/// (`bench-{run_id}-…`) and written into result JSON metadata. +static CLOUD_BENCH_RUN_ID: OnceLock = OnceLock::new(); + +/// Returns the suite-level run-id if any cloud benchmark context has been +/// created in this process, `None` otherwise. +pub fn cloud_bench_run_id() -> Option { + CLOUD_BENCH_RUN_ID.get().copied() +} + /// Test dependencies created from command line arguments /// /// To be used when a single executable with an async entry point requires @@ -75,6 +97,12 @@ pub struct BenchmarkTestDependencies { component_directory: PathBuf, component_temp_directory: Arc, registry_service: Arc, + /// Set to `Some` in cloud mode. 
Used to prefix account/app/env names with + /// `bench-{run_id}-` so that orphaned state is traceable. + run_id: Option, + /// The apps base domain for cloud mode (e.g. `apps.golem.cloud`). Used to + /// construct HTTP API deployment domains as `{env_id}.{apps_base_domain}`. + apps_base_domain: Option, } #[derive(Parser, Debug, Clone)] @@ -222,6 +250,58 @@ pub enum TestMode { #[arg(long, default_value = "test-components")] component_directory: String, }, + /// Cloud mode: run benchmarks against a deployed Golem environment via + /// Gateway-API hostnames. No local service processes are spawned. + /// + /// All management API calls (registry-service, worker-service, agents) go + /// through a single Gateway hostname (`--api-url`). HTTP API deployment + /// access (code-first HTTP APIs) goes through `{env_id}.{apps_base_domain}`. + /// + /// For `golem-dev`: + /// `--api-url https://release.dev-api.golem.cloud` + /// `--apps-base-domain apps.dev.golem.cloud` + #[command()] + Cloud { + /// Base URL of the deployed Golem API Gateway. Both registry-service + /// and worker-service paths are routed internally by the Gateway. + /// + /// For the `golem-dev` environment this is + /// `https://release.dev-api.golem.cloud`. + #[arg(long)] + api_url: Url, + /// Wildcard base domain used to build per-environment HTTP API + /// deployment hostnames: `{env_id}.{apps_base_domain}`. + /// + /// For the `golem-dev` environment this is `apps.dev.golem.cloud`. + #[arg(long)] + apps_base_domain: String, + /// Bearer token for the admin account. Used to create a fresh user + /// account for each benchmark run, which then owns all benchmark state. + #[arg(long)] + admin_account_token: String, + /// UUID of the builtin-plugin-owner account. + /// Only needed for environment-plugin-grant tests; benchmarks do not + /// use it so the default (nil UUID) is fine for benchmark runs. 
+ #[arg(long, default_value_t = Uuid::nil())] + builtin_plugin_owner_account_id: Uuid, + /// UUID of the default plan on the target cluster. + /// Only needed for environment-plugin-grant tests; benchmarks do not + /// use it so the default (nil UUID) is fine for benchmark runs. + #[arg(long, default_value_t = Uuid::nil())] + default_plan_id: Uuid, + /// Optional shard-manager gRPC hostname for a kubectl port-forward + /// (e.g. `localhost`). When set together with + /// `--shard-manager-grpc-port`, the throughput benchmark fetches the + /// routing table and labels RPC pairs as local/remote. + #[arg(long)] + shard_manager_grpc_host: Option, + /// Optional shard-manager gRPC port (e.g. `9090`). + #[arg(long)] + shard_manager_grpc_port: Option, + /// Directory containing test WASM component files. + #[arg(long, default_value = "test-components")] + component_directory: String, + }, } impl BenchmarkTestDependencies { @@ -419,6 +499,8 @@ impl BenchmarkTestDependencies { initial_agent_files_service, component_temp_directory: Arc::new(TempDir::new().unwrap()), registry_service, + run_id: None, + apps_base_domain: None, } } @@ -542,6 +624,8 @@ impl BenchmarkTestDependencies { initial_agent_files_service, component_temp_directory: Arc::new(TempDir::new().unwrap()), registry_service, + run_id: None, + apps_base_domain: None, } } TestMode::Spawned { @@ -590,17 +674,93 @@ impl BenchmarkTestDependencies { ) .await } + TestMode::Cloud { + api_url, + apps_base_domain, + admin_account_token, + builtin_plugin_owner_account_id, + default_plan_id, + shard_manager_grpc_host, + shard_manager_grpc_port, + component_directory, + } => { + let blob_storage = Arc::new( + FileSystemBlobStorage::new( + &std::env::temp_dir().join("golem-bench-blob-storage"), + ) + .await + .unwrap(), + ); + let initial_agent_files_service = + Arc::new(InitialAgentFilesService::new(blob_storage.clone())); + + // Use the process-level run_id (shared across all cloud contexts in + // this process so all 
benchmarks in a suite carry the same run ID). + let run_id = *CLOUD_BENCH_RUN_ID.get_or_init(Uuid::new_v4); + tracing::info!("Cloud benchmark run_id: {run_id}"); + + // Both registry-service and worker-service are reachable via the + // same Gateway hostname; routing is path-based. + let registry_service: Arc = + Arc::new(CloudRegistryService::new( + api_url.clone(), + TokenSecret::trusted(admin_account_token.clone()), + AccountId(*builtin_plugin_owner_account_id), + PlanId(*default_plan_id), + )); + + let shard_manager: Arc = + match (shard_manager_grpc_host, shard_manager_grpc_port) { + (Some(host), Some(port)) => { + Arc::new(ProvidedShardManager::new(host.clone(), 0, *port)) + } + _ => Arc::new(UnavailableShardManager), + }; + + let worker_service: Arc = + Arc::new(CloudWorkerService::new(api_url.clone())); + + Self { + rdb: Arc::new(UnavailableRdb), + redis: Arc::new(UnavailableRedis), + redis_monitor: Arc::new(UnavailableRedisMonitor), + shard_manager, + component_compilation_service: Arc::new(UnavailableComponentCompilationService), + worker_service, + worker_executor_cluster: Arc::new(UnavailableWorkerExecutorCluster), + component_directory: Path::new(component_directory).to_path_buf(), + blob_storage, + initial_agent_files_service, + component_temp_directory: Arc::new(TempDir::new().unwrap()), + registry_service, + run_id: Some(run_id), + apps_base_domain: Some(apps_base_domain.clone()), + } + } } } - /// Checks if all the spawned dependencies are still running, and if not, panicks + /// Checks if all the spawned dependencies are still running, and if not, panics. /// /// This can be used as a checkpoint in benchmarks to avoid infinite retries. + /// In cloud mode this is a no-op — the cloud cluster is assumed to be + /// managed externally. 
pub async fn ensure_all_deps_running(&self) { if !self.worker_executor_cluster.is_running().await { panic!("Worker executor process(es) stopped"); } } + + /// Returns the run-id for this benchmark context, if running in cloud mode. + /// Used to prefix accounts/apps/envs with `bench-{run_id}-`. + pub fn run_id(&self) -> Option { + self.run_id + } + + /// Returns the apps base domain for cloud mode (e.g. `apps.golem.cloud`). + pub fn apps_base_domain(&self) -> Option<&str> { + self.apps_base_domain.as_deref() + } } #[async_trait] @@ -652,6 +812,10 @@ impl TestDependencies for BenchmarkTestDependencies { fn registry_service(&self) -> Arc { self.registry_service.clone() } + + fn bench_name_prefix(&self) -> Option { + self.run_id.map(|id| format!("bench-{id}-")) + } } #[allow(dead_code)] diff --git a/golem-test-framework/src/config/dsl_impl.rs b/golem-test-framework/src/config/dsl_impl.rs index b228a5235e..f2d5472175 100644 --- a/golem-test-framework/src/config/dsl_impl.rs +++ b/golem-test-framework/src/config/dsl_impl.rs @@ -883,8 +883,9 @@ impl TestDslExtended for TestUserContext { environment_options: &EnvironmentOptions, ) -> anyhow::Result<(Application, Environment)> { let client = self.registry_service_client().await; - let app_name = ApplicationName(format!("app-{}", Uuid::new_v4())); - let env_name = EnvironmentName(format!("env-{}", Uuid::new_v4())); + let prefix = self.deps.bench_name_prefix().unwrap_or_default(); + let app_name = ApplicationName(format!("{prefix}app-{}", Uuid::new_v4())); + let env_name = EnvironmentName(format!("{prefix}env-{}", Uuid::new_v4())); let application = client .create_application( diff --git a/golem-test-framework/src/config/mod.rs b/golem-test-framework/src/config/mod.rs index f5c14ace60..d8bdbe6b39 100644 --- a/golem-test-framework/src/config/mod.rs +++ b/golem-test-framework/src/config/mod.rs @@ -56,6 +56,13 @@ pub trait TestDependencies: Send + Sync + Clone { fn initial_agent_files_service(&self) -> Arc; fn 
registry_service(&self) -> Arc; + /// Returns an optional name prefix applied to benchmark-created accounts, + /// applications, and environments. Non-`None` in cloud mode, where the + /// prefix is `bench-{run_id}-` to make orphaned state traceable. + fn bench_name_prefix(&self) -> Option { + None + } + async fn admin(&self) -> TestUserContext where Self: Sized, @@ -82,7 +89,12 @@ pub trait TestDependencies: Send + Sync + Clone { .client(®istry_service.admin_account_token()) .await; - let name = Uuid::new_v4().to_string(); + let uuid = Uuid::new_v4().to_string(); + let name = if let Some(prefix) = self.bench_name_prefix() { + format!("{prefix}{uuid}") + } else { + uuid + }; let account_data = AccountCreation { email: AccountEmail::new(format!("{name}@golem.cloud")), name, diff --git a/golem-worker-executor-test-utils/src/lib.rs b/golem-worker-executor-test-utils/src/lib.rs index ee81b41531..6d86ffb7a7 100644 --- a/golem-worker-executor-test-utils/src/lib.rs +++ b/golem-worker-executor-test-utils/src/lib.rs @@ -82,6 +82,7 @@ use golem_worker_executor::preview2::golem::agent::host::{ }; use golem_worker_executor::preview2::{golem_api_1_x, golem_durability}; use golem_worker_executor::services::active_workers::ActiveWorkers; +use golem_worker_executor::services::active_workers::memory_probe::FixedProbe; use golem_worker_executor::services::agent_types::AgentTypesService; use golem_worker_executor::services::agent_webhooks::AgentWebhooksService; use golem_worker_executor::services::blob_store::{ @@ -533,6 +534,16 @@ fn make_base_test_config(deps: &WorkerExecutorTestDependencies) -> GolemConfig { // without attempting a gRPC connection to a registry service that does // not exist in this test setup. resource_limits: ResourceLimitsConfig::Disabled(ResourceLimitsDisabledConfig {}), + // The measured-headroom admission gate requires the executor to own its + // memory environment (cgroup/process). 
The in-process test harness runs + // the executor alongside the test framework and other services, so the + // probe cannot isolate this executor's footprint. Disable the gate so + // admission always proceeds and tests are not subject to a memory limit + // derived from the shared host. + memory: MemoryConfig { + enable_measured_admission: false, + ..Default::default() + }, ..Default::default() } } @@ -696,6 +707,16 @@ pub async fn start_customized( apply_sqlite_storage_config(&mut config, deps, context); config.memory = MemoryConfig { system_memory_override, + // Enable the measured-headroom gate when a test pins a memory limit, so + // memory-pressure tests exercise the real admission controller under that + // limit. The test bootstrap (create_active_workers) feeds the gate a + // fixed probe reporting this limit with zero current usage, so admission + // is decided on the granted accounting against the pinned limit and is + // not perturbed by the shared test process's RSS. Otherwise the gate is + // disabled (see make_base_test_config). The usable ratio + // (worker_memory_ratio, default 0.8) applies, matching the pre-gate + // semaphore pool size of system_memory_override * ratio. + enable_measured_admission: system_memory_override.is_some(), ..Default::default() }; config.filesystem_storage = FilesystemStorageConfig { @@ -1358,6 +1379,31 @@ impl InvocationContextManagement for TestWorkerCtx { #[async_trait] impl Bootstrap for TestServerBootstrap { + fn create_active_workers( + &self, + golem_config: &GolemConfig, + ) -> Arc> { + // The in-process test harness shares its process (and RSS) with the test + // framework and other services, so a process-RSS probe cannot isolate + // this executor's footprint. 
When a test pins a memory limit via + // system_memory_override, give the gate a fixed probe reporting that + // limit with zero current usage, so admission is decided solely on the + // granted accounting (exact and process-isolated) against the pinned + // limit. The usable_ratio (worker_memory_ratio) still applies, matching + // the pre-gate semaphore pool size of system_memory_override * ratio. + match golem_config.memory.system_memory_override { + Some(limit) => Arc::new(ActiveWorkers::new_with_probe( + Box::new(FixedProbe::new(limit, 0)), + &golem_config.memory, + &golem_config.filesystem_storage, + )), + None => Arc::new(ActiveWorkers::new( + &golem_config.memory, + &golem_config.filesystem_storage, + )), + } + } + fn create_shard_manager_service( &self, _shard_manager_client: Arc, diff --git a/golem-worker-executor/Cargo.toml b/golem-worker-executor/Cargo.toml index bfe2dcafe8..9e83a12c1f 100644 --- a/golem-worker-executor/Cargo.toml +++ b/golem-worker-executor/Cargo.toml @@ -73,7 +73,10 @@ lazy_static = { workspace = true } log = { workspace = true } mac_address = { workspace = true, features = ["serde"] } md5 = { workspace = true } +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } metrohash = { workspace = true } +mimalloc = { workspace = true } nonempty-collections = { workspace = true } nonzero_ext = { workspace = true } pgvector = { workspace = true } @@ -92,6 +95,7 @@ sqlx-core = { workspace = true } sysinfo = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } +tokio-metrics = { workspace = true } tokio-stream = { workspace = true } tokio-tungstenite = { workspace = true } tokio-util = { workspace = true } diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index 2a52884966..35725ab38f 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env 
@@ -4,6 +4,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0" GOLEM__HTTP_PORT=8082 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s" GOLEM__MAX_WEBSOCKET_CONNECTIONS=100 +GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s" GOLEM__TRACING_FILE_NAME_WITH_PORT=true GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25 GOLEM__ACTIVE_WORKERS__TTL="8h" @@ -72,6 +73,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -213,6 +216,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0" GOLEM__HTTP_PORT=8082 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s" GOLEM__MAX_WEBSOCKET_CONNECTIONS=100 +GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s" GOLEM__TRACING_FILE_NAME_WITH_PORT=true GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25 GOLEM__ACTIVE_WORKERS__TTL="8h" @@ -291,6 +295,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" +GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 @@ -432,6 +438,7 @@ GOLEM__HTTP_ADDRESS="0.0.0.0" GOLEM__HTTP_PORT=8082 GOLEM__MAX_IN_FUNCTION_RETRY_DELAY="20s" GOLEM__MAX_WEBSOCKET_CONNECTIONS=100 +GOLEM__RUNTIME_METRICS_SAMPLING_INTERVAL="5s" GOLEM__TRACING_FILE_NAME_WITH_PORT=true GOLEM__ACTIVE_WORKERS__DROP_WHEN_FULL=0.25 GOLEM__ACTIVE_WORKERS__TTL="8h" @@ -480,6 +487,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024 GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024 GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100 GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms" 
+GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0 +GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true #GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE= GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1 GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8 diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index 4c89275519..df58d45b50 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ b/golem-worker-executor/config/worker-executor.toml @@ -3,6 +3,7 @@ http_address = "0.0.0.0" http_port = 8082 max_in_function_retry_delay = "20s" max_websocket_connections = 100 +runtime_metrics_sampling_interval = "5s" tracing_file_name_with_port = true [active_workers] @@ -125,6 +126,8 @@ max_oplog_query_pages_size = 100 [memory] acquire_retry_delay = "500ms" +component_size_coefficient = 2.0 +enable_measured_admission = true worker_estimate_coefficient = 1.1 worker_memory_ratio = 0.8 @@ -331,6 +334,7 @@ without_time = false # http_port = 8082 # max_in_function_retry_delay = "20s" # max_websocket_connections = 100 +# runtime_metrics_sampling_interval = "5s" # tracing_file_name_with_port = true # # [active_workers] @@ -456,6 +460,8 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# component_size_coefficient = 2.0 +# enable_measured_admission = true # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # @@ -661,6 +667,7 @@ without_time = false # http_port = 8082 # max_in_function_retry_delay = "20s" # max_websocket_connections = 100 +# runtime_metrics_sampling_interval = "5s" # tracing_file_name_with_port = true # # [active_workers] @@ -757,6 +764,8 @@ without_time = false # # [memory] # acquire_retry_delay = "500ms" +# component_size_coefficient = 2.0 +# enable_measured_admission = true # worker_estimate_coefficient = 1.1 # worker_memory_ratio = 0.8 # diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt 
b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt new file mode 100644 index 0000000000..eb12d21790 --- /dev/null +++ b/golem-worker-executor/proptest-regressions/services/active_workers/admission/tests.txt @@ -0,0 +1,9 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc b49eb145c9dca28d347382d8e482bb2cb6c5d256ccaba7532b370fbadc2bb3fb # shrinks to (limit, residents) = (500, []), schedule = [Admit(220), Admit(92), Admit(189)] +cc 9727f7e7aab54f8f48e6b856f9d70428fd8503767677fa7c232e27263273e071 # shrinks to limit = 815, schedule = [Grant(485), Grant(1), Grant(7), Exit(1), Grant(1), FaultIn(2, 1), Grant(40), Exit(2), Grant(284)] +cc 41321d47abd75b283d651e63e40c0f5191b680b908c05879c02d5f36b70de66c # shrinks to (limit, residents) = (1369, [Resident { size: 144, priority: Idle }, Resident { size: 228, priority: Warm }, Resident { size: 152, priority: Warm }, Resident { size: 101, priority: Idle }, Resident { size: 68, priority: Warm }, Resident { size: 45, priority: Idle }, Resident { size: 30, priority: Idle }, Resident { size: 20, priority: Idle }, Resident { size: 13, priority: Warm }, Resident { size: 9, priority: Idle }, Resident { size: 6, priority: Idle }]), schedule = [Admit(270), Admit(785), Admit(250), Admit(146), Admit(456)] diff --git a/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt new file mode 100644 index 0000000000..5845bf0e72 --- /dev/null +++ b/golem-worker-executor/proptest-regressions/services/active_workers/tests.txt @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. 
It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 25407766c98e9d718173e44b5321f97049eea6d6d7737aad80a937d7230d67d9 # shrinks to limit = 1, ops = [Acquire, Acquire, CancelPending(Index(423873604949)), Acquire, ReleaseThenCancel(Index(2899867607303593255), Index(13233034632676646474))] diff --git a/golem-worker-executor/src/identity.rs b/golem-worker-executor/src/identity.rs new file mode 100644 index 0000000000..e2f95b0cae --- /dev/null +++ b/golem-worker-executor/src/identity.rs @@ -0,0 +1,32 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Process/pod identity for this worker-executor instance. +//! +//! The identity is derived from the `POD_NAME` env var, falling back to +//! `HOSTNAME`, then `"unknown"`, resolved once and cached for the lifetime of +//! the process. It is used both as the `executor_id` metric label and anywhere +//! else the running instance needs to identify itself. + +/// Returns the stable identity of this worker-executor instance. +/// +/// Resolved once on first call and cached for the lifetime of the process. 
+pub fn executor_id() -> &'static str { + static EXECUTOR_ID: std::sync::OnceLock = std::sync::OnceLock::new(); + EXECUTOR_ID.get_or_init(|| { + std::env::var("POD_NAME") + .or_else(|_| std::env::var("HOSTNAME")) + .unwrap_or_else(|_| "unknown".to_string()) + }) +} diff --git a/golem-worker-executor/src/lib.rs b/golem-worker-executor/src/lib.rs index a62f944cf0..b9ecc4f640 100644 --- a/golem-worker-executor/src/lib.rs +++ b/golem-worker-executor/src/lib.rs @@ -16,6 +16,7 @@ pub mod bootstrap; pub mod config; pub mod durable_host; pub mod grpc; +pub mod identity; pub mod metrics; pub mod model; pub mod preview2; @@ -161,6 +162,18 @@ impl Drop for RunDetails { #[async_trait] #[allow(clippy::too_many_arguments)] pub trait Bootstrap { + /// Creates the [`ActiveWorkers`] service, including the measured-headroom + /// admission gate. The default builds the memory probe from the config + /// (cgroup/process/override). The in-process test harness overrides this to + /// inject a probe with a pinned limit and usage so the gate is deterministic + /// and isolated from the shared test process's RSS. 
+ fn create_active_workers(&self, golem_config: &GolemConfig) -> Arc> { + Arc::new(ActiveWorkers::::new( + &golem_config.memory, + &golem_config.filesystem_storage, + )) + } + fn create_shard_manager_service( &self, shard_manager_client: Arc, @@ -769,10 +782,7 @@ pub async fn create_worker_executor_impl< } }; - let active_workers = Arc::new(ActiveWorkers::::new( - &golem_config.memory, - &golem_config.filesystem_storage, - )); + let active_workers = bootstrap.create_active_workers(&golem_config); let file_loader = Arc::new(FileLoader::new( initial_files_service.clone(), @@ -1000,13 +1010,18 @@ pub async fn bootstrap_and_run_worker_executor< ) -> anyhow::Result { debug!("Initializing worker executor"); - let total_system_memory = golem_config.memory.total_system_memory(); - let system_memory = golem_config.memory.system_memory(); - let worker_memory = golem_config.memory.worker_memory(); + let memory_snapshot = crate::services::active_workers::memory_probe::default_probe( + golem_config.memory.system_memory_override, + ) + .snapshot(); + let total_system_memory = memory_snapshot.limit_bytes; + let used_system_memory = memory_snapshot.current_bytes; + let worker_memory = + (total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64; info!( - "Total system memory: {}, Available system memory: {}, Total memory available for workers: {}", + "Measured memory limit: {}, Currently used: {}, Usable for workers: {}", ISizeFormatter::new(total_system_memory, BINARY), - ISizeFormatter::new(system_memory, BINARY), + ISizeFormatter::new(used_system_memory, BINARY), ISizeFormatter::new(worker_memory, BINARY) ); @@ -1047,11 +1062,18 @@ pub async fn bootstrap_and_run_worker_executor< let leak_detector = worker_executor_impl.leak_detector(); + let runtime_metrics = crate::metrics::runtime::install_runtime_metrics( + runtime.clone(), + golem_config.runtime_metrics_sampling_interval, + join_set, + ); + let grpc_port = run_grpc_server(worker_executor_impl, 
lazy_worker_activator, join_set).await?; - let http_port = golem_service_base::observability::start_health_and_metrics_server( + let http_port = golem_service_base::observability::start_health_and_metrics_server_with_extra( golem_config.http_addr()?, prometheus_registry, + runtime_metrics, "Worker executor is running", join_set, ) diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index 768b3e6b98..980ae3842d 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -69,18 +69,26 @@ const SCHEDULER_LAG_BUCKETS: &[f64; 11] = &[ 0.001, 0.01, 0.1, 1.0, 5.0, 15.0, 30.0, 60.0, 120.0, 300.0, 600.0, ]; -const MEMORY_SIZE_BUCKETS: &[f64; 11] = &[ - 1024.0, - 4096.0, - 16384.0, - 65536.0, - 262144.0, - 1048576.0, - 4194304.0, - 16777216.0, - 67108864.0, - 268435456.0, - 1073741824.0, +/// Buckets for the size of a single `memory.grow` allocation. Deliberately +/// fine-grained in the 1-32 MiB band where typical guest grows cluster, so +/// that p90/p99 quantiles are not pinned to a coarse 4-16 MiB bucket edge. +const MEMORY_SIZE_BUCKETS: &[f64; 16] = &[ + 65536.0, // 64 KiB + 262144.0, // 256 KiB + 1048576.0, // 1 MiB + 2097152.0, // 2 MiB + 4194304.0, // 4 MiB + 6291456.0, // 6 MiB + 8388608.0, // 8 MiB + 12582912.0, // 12 MiB + 16777216.0, // 16 MiB + 25165824.0, // 24 MiB + 33554432.0, // 32 MiB + 67108864.0, // 64 MiB + 134217728.0, // 128 MiB + 268435456.0, // 256 MiB + 536870912.0, // 512 MiB + 1073741824.0, // 1 GiB ]; pub mod component { @@ -105,6 +113,83 @@ pub mod component { } } +pub mod runtime { + use std::sync::Arc; + use std::time::Duration; + + use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle}; + use tokio::runtime::Handle; + use tokio::task::JoinSet; + use tokio_metrics::RuntimeMetricsReporterBuilder; + + /// How often the recorder's upkeep runs to keep its internal storage + /// bounded (e.g. pruning idle metrics once an idle timeout is configured). 
+ const UPKEEP_INTERVAL: Duration = Duration::from_secs(30); + + /// Installs a dedicated `metrics`-crate Prometheus recorder for tokio + /// runtime metrics, spawns the tokio-metrics reporter on `join_set`, and + /// returns a renderer that emits the collected metrics in Prometheus text + /// format. + /// + /// `sampling_interval` controls how often metrics are sampled from the + /// runtime into the recorder; Prometheus scrapes the rendered values + /// independently. + /// + /// The returned closure is appended to the `prometheus`-crate scrape output + /// on the shared `/metrics` endpoint, so all `tokio_*` series appear on the + /// same endpoint as the rest of the executor's metrics, carrying the same + /// `executor_id` label. + /// + /// Returns `None` if a global `metrics` recorder is already installed (which + /// should not happen in the executor), in which case runtime metrics are + /// simply not exported. + pub fn install_runtime_metrics( + runtime: Handle, + sampling_interval: Duration, + join_set: &mut JoinSet>, + ) -> Option String + Send + Sync>> { + let executor_id = crate::identity::executor_id(); + + let handle: PrometheusHandle = match PrometheusBuilder::new() + .add_global_label("executor_id", executor_id) + .install_recorder() + { + Ok(handle) => handle, + Err(err) => { + tracing::warn!( + "Failed to install tokio runtime metrics recorder, runtime metrics will not be exported: {err}" + ); + return None; + } + }; + + let reporter = RuntimeMetricsReporterBuilder::default().with_interval(sampling_interval); + join_set.spawn_on( + async move { + reporter.describe_and_run().await; + Ok(()) + }, + &runtime, + ); + + // Run periodic upkeep so the recorder's internal storage stays bounded. 
+ let upkeep_handle = handle.clone(); + join_set.spawn_on( + async move { + let mut interval = tokio::time::interval(UPKEEP_INTERVAL); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + loop { + interval.tick().await; + upkeep_handle.run_upkeep(); + } + }, + &runtime, + ); + + Some(Arc::new(move || handle.render())) + } +} + pub mod events { use lazy_static::lazy_static; use prometheus::*; @@ -182,6 +267,61 @@ pub mod workers { crate::metrics::BLOB_SIZE_BUCKETS.to_vec() ) .unwrap(); + pub static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!( + "golem_worker_memory_pool_total_bytes", + "Usable memory ceiling (usable_ratio * measured limit) the admission gate admits against on this executor", + &["executor_id"] + ) + .unwrap(); + pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!( + "golem_worker_memory_pool_used_bytes", + "Total linear memory granted to live workers and reserved by the admission gate on this executor", + &["executor_id"] + ) + .unwrap(); + pub static ref WORKER_ADMISSION_RSS_BYTES: GaugeVec = register_gauge_vec!( + "golem_worker_admission_rss_bytes", + "Measured resident memory (probe snapshot) the admission gate last read on this executor", + &["executor_id"] + ) + .unwrap(); + pub static ref WORKER_MEMORY_GROW_REJECTED_TOTAL: CounterVec = register_counter_vec!( + "golem_worker_memory_grow_rejected_total", + "Invocations interrupted because a worker's linear-memory grow could not be admitted by the gate (out-of-memory trap, retried via reacquire)", + &["executor_id"] + ) + .unwrap(); + } + + /// Counts one invocation interrupted because a linear-memory grow was + /// refused by the admission gate (the worker traps out-of-memory and is + /// restarted to reacquire memory). 
+ pub fn record_worker_memory_grow_rejected() { + WORKER_MEMORY_GROW_REJECTED_TOTAL + .with_label_values(&[crate::metrics::storage::executor_id()]) + .inc(); + } + + /// Sets the gate's usable memory ceiling gauge. + pub fn record_worker_memory_ceiling(bytes: u64) { + WORKER_MEMORY_POOL_TOTAL_BYTES + .with_label_values(&[crate::metrics::storage::executor_id()]) + .set(bytes as f64); + } + + /// Sets the gauge of total memory granted to live workers (the gate's + /// reservation). + pub fn record_worker_memory_granted(bytes: u64) { + WORKER_MEMORY_POOL_USED_BYTES + .with_label_values(&[crate::metrics::storage::executor_id()]) + .set(bytes as f64); + } + + /// Sets the gauge of measured resident memory last read by the gate. + pub fn record_worker_admission_rss(bytes: u64) { + WORKER_ADMISSION_RSS_BYTES + .with_label_values(&[crate::metrics::storage::executor_id()]) + .set(bytes as f64); } pub fn record_worker_call(api_name: &'static str) { @@ -221,6 +361,9 @@ pub mod workers { WORKER_WAITING_FOR_MEMORY_COUNT .with_label_values(&[id]) .set(0.0); + WORKER_MEMORY_GROW_REJECTED_TOTAL + .with_label_values(&[id]) + .inc_by(0.0); } pub fn inc_worker_memory_resident() { @@ -294,18 +437,6 @@ pub mod workers { WORKER_FILESYSTEM_SEMAPHORE_AVAILABLE.add(permits.into_f64()); } - /// Records acquisition of `bytes` from the worker-memory pool. - /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`. - pub fn record_memory_permit_acquired(bytes: usize) { - crate::metrics::storage::record_worker_memory_pool_acquired(bytes as u64); - } - - /// Records release of `bytes` back to the worker-memory pool. - /// Updates `golem_worker_memory_pool_used_bytes{executor_id}`. 
- pub fn record_memory_permit_released(bytes: usize) { - crate::metrics::storage::record_worker_memory_pool_released(bytes as u64); - } - pub fn record_worker_kv_cache_value_size(bytes: usize) { WORKER_KV_CACHE_VALUE_SIZE_BYTES .with_label_values(&[crate::metrics::storage::executor_id()]) @@ -504,7 +635,13 @@ pub mod wasm { .unwrap(); static ref ALLOCATED_MEMORY_BYTES: Histogram = register_histogram!( "allocated_memory_bytes", - "Amount of memory allocated by a single memory.grow instruction", + "Worker's total linear memory size after a memory.grow, sampled at each grow", + crate::metrics::MEMORY_SIZE_BUCKETS.to_vec() + ) + .unwrap(); + static ref WORKER_RESIDENT_LINEAR_MEMORY_BYTES: Histogram = register_histogram!( + "worker_resident_linear_memory_bytes", + "Per-worker cumulative linear-memory grant (total_linear_memory_size = sum of memory.grow deltas) sampled when the worker is admitted. This is the linear memory the admission gate reserves for the worker; it is an upper bound on resident RSS, not measured resident memory, since grown pages are largely demand-paged. Compare to container_memory_working_set_bytes for the gap.", crate::metrics::MEMORY_SIZE_BUCKETS.to_vec() ) .unwrap(); @@ -580,6 +717,10 @@ pub mod wasm { pub fn record_allocated_memory(amount: usize) { ALLOCATED_MEMORY_BYTES.observe(amount as f64); } + + pub fn record_worker_resident_linear_memory(bytes: u64) { + WORKER_RESIDENT_LINEAR_MEMORY_BYTES.observe(bytes as f64); + } } pub mod oplog { @@ -717,16 +858,10 @@ pub mod storage { use lazy_static::lazy_static; use prometheus::*; - /// Returns the executor identity label: POD_NAME env var, falling back to HOSTNAME, then "unknown". - /// Resolved once on first call and cached for the lifetime of the process. 
- pub fn executor_id() -> &'static str { - static EXECUTOR_ID: std::sync::OnceLock = std::sync::OnceLock::new(); - EXECUTOR_ID.get_or_init(|| { - std::env::var("POD_NAME") - .or_else(|_| std::env::var("HOSTNAME")) - .unwrap_or_else(|_| "unknown".to_string()) - }) - } + /// Re-exported from [`crate::identity`], which owns the process identity. + /// Kept here so existing metric-recording call sites can keep using + /// `crate::metrics::storage::executor_id()`. + pub use crate::identity::executor_id; lazy_static! { pub static ref STORAGE_FILESYSTEM_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!( @@ -741,18 +876,6 @@ pub mod storage { &["executor_id"] ) .unwrap(); - pub static ref WORKER_MEMORY_POOL_TOTAL_BYTES: GaugeVec = register_gauge_vec!( - "golem_worker_memory_pool_total_bytes", - "Configured worker-memory semaphore size in bytes for this executor", - &["executor_id"] - ) - .unwrap(); - pub static ref WORKER_MEMORY_POOL_USED_BYTES: GaugeVec = register_gauge_vec!( - "golem_worker_memory_pool_used_bytes", - "Bytes currently acquired from the worker-memory semaphore on this executor", - &["executor_id"] - ) - .unwrap(); } pub fn record_filesystem_pool_total(bytes: u64) { @@ -772,22 +895,4 @@ pub mod storage { .with_label_values(&[executor_id()]) .sub(bytes as f64); } - - pub fn record_worker_memory_pool_total(bytes: u64) { - WORKER_MEMORY_POOL_TOTAL_BYTES - .with_label_values(&[executor_id()]) - .set(bytes as f64); - } - - pub fn record_worker_memory_pool_acquired(bytes: u64) { - WORKER_MEMORY_POOL_USED_BYTES - .with_label_values(&[executor_id()]) - .add(bytes as f64); - } - - pub fn record_worker_memory_pool_released(bytes: u64) { - WORKER_MEMORY_POOL_USED_BYTES - .with_label_values(&[executor_id()]) - .sub(bytes as f64); - } } diff --git a/golem-worker-executor/src/server.rs b/golem-worker-executor/src/server.rs index fbd1c7e60c..18b286adcb 100644 --- a/golem-worker-executor/src/server.rs +++ b/golem-worker-executor/src/server.rs @@ -21,6 +21,9 @@ use 
std::sync::Arc; use tokio::task::JoinSet; use tracing::info; +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + fn main() -> Result<(), anyhow::Error> { match make_config_loader().load_or_dump_config() { Some(mut config) => { diff --git a/golem-worker-executor/src/services/active_workers/admission/mod.rs b/golem-worker-executor/src/services/active_workers/admission/mod.rs new file mode 100644 index 0000000000..ec57acd699 --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/admission/mod.rs @@ -0,0 +1,342 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Measured-headroom admission decision. +//! +//! Gates worker admission on the executor environment's memory headroom. It is +//! the sole admission authority: there is no estimate-based semaphore behind it. +//! +//! The gate weighs two quantities against the usable ceiling: +//! +//! * Measured RSS from the [`MemoryProbe`] (cgroup `memory.current` on a +//! constrained pod) — what is resident right now. +//! * The total linear memory *granted* to live workers — what they could fault +//! in at any moment. +//! +//! Both matter because they fail in opposite directions. Measured RSS lags +//! admission: `memory.current` counts only touched pages, so a worker admitted +//! moments ago is not yet resident and a burst admitted against the same low +//! snapshot would collectively over-commit. The granted total leads residency: a +//! 
worker can fault in any page of the virtual memory it was already granted at +//! any later time, with no admission call to intercept it, so a gate that +//! reserved only what is resident would let a node full of lightly-touched +//! workers OOM by writing into memory they already hold. The gate therefore +//! reserves the full granted total from admission until unload, and admits +//! against the *larger* of measured RSS and that granted total — safe against +//! both the burst race and later faulting of granted pages. +//! +//! The granted total is maintained by two integer updates: a worker's grant is +//! added on admission, and removed when the [`MemoryGrant`] guard returned by +//! admission is dropped. Tying the removal to the guard's drop — rather than to +//! an explicit release call on some worker-lifecycle path — keeps the accounting +//! symmetric no matter how a worker's start ends: whether it becomes resident and +//! later stops, or its start is cancelled mid-flight (e.g. the worker is deleted +//! while still waiting for permits), dropping the guard returns its reservation +//! exactly once. The headroom check re-derives the reservation from the +//! maintained total and the current probe reading, so it is O(1) and exact +//! regardless of worker churn. +//! +//! When headroom is short the controller evicts already-resident idle-then-warm +//! work; if it still cannot make room it rejects rather than over-committing. +//! +//! The controller is decoupled from `Worker`/wasmtime via the [`EvictionSource`] +//! trait so its decision logic can be exercised in isolation with synthetic +//! probes and candidate sets. + +use super::memory_probe::MemoryProbe; +use async_trait::async_trait; +use std::sync::{Arc, Mutex}; + +/// Why an eviction candidate is worth evicting, in priority order. Lower +/// variants are evicted first. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) enum EvictionPriority { + /// Resident in memory, not executing, no durable pending work. Cheapest to + /// evict — losing it costs at most a re-load on next use. + Idle, + /// Resident in memory, not executing, but has durable pending work. Evicted + /// only after all idle candidates are exhausted. + Warm, +} + +/// A source of evictable, already-resident memory the controller can reclaim to +/// restore headroom. Abstracts over the live worker set so the decision logic +/// is testable without `Worker`/wasmtime. +#[async_trait] +pub(crate) trait EvictionSource: Send + Sync { + /// Evict at the given priority tier, attempting to free at least + /// `needed_bytes`. Returns the number of bytes actually reclaimed (which may + /// be less if the tier is exhausted, or more if a single victim was larger + /// than needed). Must not evict from a higher (more expensive) tier than the + /// one requested. + async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64; +} + +/// The outcome of an admission attempt. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum AdmissionDecision { + /// There is enough real headroom (possibly after eviction) to admit the + /// request without risking the limit. + Admit, + /// Not enough headroom could be freed; the request must back off rather + /// than over-commit. + Reject, +} + +/// Configuration for the headroom-based admission decision. +/// +/// * `usable_ratio` — fraction of the measured limit usable for WASM admission. +/// The remainder is left for the host (the executor process, allocator +/// arenas, runtime buffers). Mirrors `worker_memory_ratio`, but applied to the +/// measured limit rather than the configured total. +#[derive(Debug, Clone, Copy)] +pub(crate) struct AdmissionPolicy { + /// Fraction (0.0..=1.0) of the measured limit usable for WASM admission. 
+ pub usable_ratio: f64, +} + +/// Decides admission against measured headroom, evicting resident idle/warm +/// work as needed. Holds its policy and probe; live usage is read fresh from the +/// probe on each call. The only retained state is `granted`: the total linear +/// memory granted to live workers, maintained across admit and unload, which the +/// gate reserves so a worker cannot OOM the node by faulting in granted pages. +pub(crate) struct AdmissionController { + probe: Box, + policy: AdmissionPolicy, + granted: Mutex, +} + +impl AdmissionController { + pub fn new(probe: Box, policy: AdmissionPolicy) -> Self { + let ceiling = (probe.snapshot().limit_bytes as f64 * policy.usable_ratio) as u64; + crate::metrics::workers::record_worker_memory_ceiling(ceiling); + Self { + probe, + policy, + granted: Mutex::new(0), + } + } + + /// Bytes available for a new admission: the usable ceiling minus the larger + /// of measured RSS and the total memory granted to live workers. Saturating — + /// never underflows when already over the ceiling. + /// + /// A worker can fault in any page of the virtual memory it was granted at any + /// time, with no admission call to intercept it, so the gate must reserve the + /// full granted total even before it is resident. Measured RSS is only larger + /// than the granted total transiently (host/runtime overhead the grant does + /// not cover), so taking the maximum keeps the gate safe against both the + /// grant a worker may yet fault in and any usage the grant does not capture. + fn admissible_headroom(&self) -> u64 { + let granted = *self.granted.lock().unwrap(); + self.headroom_with_granted(granted) + } + + /// Computes admissible headroom for an already-read `granted` value. Reads + /// the probe and emits the ceiling/RSS metrics. Kept separate from the lock + /// acquisition so the decision-and-reserve sequence can hold the lock across + /// both steps (see [`Self::try_reserve_locked`]). 
+ fn headroom_with_granted(&self, granted: u64) -> u64 { + let snapshot = self.probe.snapshot(); + let ceiling = (snapshot.limit_bytes as f64 * self.policy.usable_ratio) as u64; + crate::metrics::workers::record_worker_memory_ceiling(ceiling); + crate::metrics::workers::record_worker_admission_rss(snapshot.current_bytes); + ceiling.saturating_sub(snapshot.current_bytes.max(granted)) + } + + /// Atomically admits `request_bytes` if the headroom computed against the + /// current granted total covers it: reads `granted`, computes headroom, and + /// adds the reservation all under one lock so two concurrent admissions + /// cannot both pass the check against the same headroom and overshoot the + /// ceiling. Returns whether the request was admitted. + fn try_reserve_locked(&self, request_bytes: u64) -> bool { + let mut granted = self.granted.lock().unwrap(); + if self.headroom_with_granted(*granted) >= request_bytes { + *granted += request_bytes; + crate::metrics::workers::record_worker_memory_granted(*granted); + true + } else { + false + } + } + + /// Record `request_bytes` of memory granted to a newly admitted worker. The + /// gate reserves this until the worker unloads, because the worker may fault + /// the granted pages in at any later time. + fn reserve(&self, request_bytes: u64) { + let mut granted = self.granted.lock().unwrap(); + *granted += request_bytes; + crate::metrics::workers::record_worker_memory_granted(*granted); + } + + /// Reserve memory for a cost that is a committed consequence of an already + /// admitted worker rather than a fresh admission — currently a component's + /// compiled module, loaded into RAM when the first worker of the component + /// becomes resident and shared by all its workers. Unlike admission this does + /// not evict or reject (the worker is already in); it accounts the bytes so + /// later admissions see them. Released with [`Self::release`]. 
+ pub(crate) fn reserve_committed(&self, bytes: u64) { + self.reserve(bytes); + } + + /// Release the grant of a worker that has unloaded, given the bytes it was + /// granted. Its pages leave memory, so its grant no longer needs reserving; + /// not releasing it would permanently shrink admissible headroom as workers + /// come and go. + pub(crate) fn release(&self, reserved_bytes: u64) { + let mut granted = self.granted.lock().unwrap(); + *granted = granted.saturating_sub(reserved_bytes); + crate::metrics::workers::record_worker_memory_granted(*granted); + } + + /// Pre-register grant bytes for workers that were already live when the + /// controller was created. Test-only: production registers every worker's + /// grant through admission. + #[cfg(test)] + pub fn seed_granted(&self, bytes: u64) { + *self.granted.lock().unwrap() += bytes; + } + + /// Decide whether `request_bytes` can be admitted, evicting from `source` if + /// the current headroom is insufficient. + /// + /// Eviction is attempted idle-first, then warm, and only up to the shortfall + /// (never evicts when headroom already suffices). After eviction the + /// headroom is re-measured against ground truth; the request is admitted only + /// if the real headroom now covers it, otherwise it is rejected. On admit the + /// request is added to the in-flight reservation. + async fn try_admit( + &self, + request_bytes: u64, + source: &dyn EvictionSource, + ) -> AdmissionDecision { + // Fast path: atomically admit if there is already enough real headroom. + if self.try_reserve_locked(request_bytes) { + return AdmissionDecision::Admit; + } + + // Reclaim resident, idle-then-warm work up to the shortfall. 
+ let shortfall = request_bytes.saturating_sub(self.admissible_headroom()); + let mut remaining = shortfall; + + for priority in [EvictionPriority::Idle, EvictionPriority::Warm] { + if remaining == 0 { + break; + } + let freed = source.evict_at_most(priority, remaining).await; + remaining = remaining.saturating_sub(freed); + } + + // Re-measure against ground truth rather than trusting the freed tally: + // the probe is the authority, and other activity may have moved usage + // in either direction while we were evicting. The check-and-reserve is + // atomic so a concurrent admission cannot slip in between. + if self.try_reserve_locked(request_bytes) { + AdmissionDecision::Admit + } else { + AdmissionDecision::Reject + } + } + + /// The current admissible headroom. Used by tests to assert the gate's + /// accounting; production reads headroom indirectly through admission. + #[cfg(test)] + pub(crate) fn headroom_bytes(&self) -> u64 { + self.admissible_headroom() + } + + /// Admit `request_bytes`, evicting resident idle-then-warm work if needed, + /// and on success return a [`MemoryGrant`] guard that owns the reservation + /// and releases it on drop; `None` if the request cannot be admitted. + /// + /// The grant a starting worker holds passes through several `.await` points + /// before the worker becomes resident (per-account concurrency, component + /// charge, filesystem storage); if that work is cancelled — as when the + /// worker is deleted while still waiting — the guard's drop returns the + /// reservation, so a cancelled start cannot leak headroom. 
+ pub(crate) async fn admit( + self: &Arc, + request_bytes: u64, + source: &dyn EvictionSource, + ) -> Option { + match self.try_admit(request_bytes, source).await { + AdmissionDecision::Admit => Some(MemoryGrant { + controller: Some(self.clone()), + bytes: request_bytes, + }), + AdmissionDecision::Reject => None, + } + } +} + +/// Owns a memory reservation made with the [`AdmissionController`] and returns it +/// to the gate when dropped, so a reservation is released exactly once regardless +/// of whether the worker became resident or its start was cancelled. +/// +/// When measured admission is disabled (no controller) the grant is inert: it +/// reserves nothing and releasing it is a no-op, so callers can hold a grant +/// uniformly without branching on whether admission is active. +pub(crate) struct MemoryGrant { + controller: Option>, + bytes: u64, +} + +impl MemoryGrant { + /// An inert grant for when measured admission is disabled: holds no + /// reservation and releases nothing on drop. + pub(crate) fn inert() -> Self { + Self { + controller: None, + bytes: 0, + } + } + + /// Fold another grant's bytes into this one, so a worker that grows its + /// memory carries a single grant covering its whole reservation. The other + /// grant is consumed and its reservation transferred here; the combined total + /// is released exactly once when this grant drops. + pub(crate) fn merge(&mut self, mut other: MemoryGrant) { + if other.controller.is_some() { + // Adopt the controller so a merged grant acquired while admission was + // enabled still releases, even if `self` started inert. + if self.controller.is_none() { + self.controller = other.controller.take(); + } + self.bytes += other.bytes; + } + // Neutralize the absorbed grant so its drop does not release the bytes + // now owned by `self`. 
+ other.bytes = 0; + other.controller = None; + } +} + +impl std::fmt::Debug for MemoryGrant { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemoryGrant") + .field("bytes", &self.bytes) + .finish() + } +} + +impl Drop for MemoryGrant { + fn drop(&mut self) { + if let Some(controller) = &self.controller { + controller.release(self.bytes); + } + } +} + +#[cfg(test)] +mod tests; diff --git a/golem-worker-executor/src/services/active_workers/admission/tests.rs b/golem-worker-executor/src/services/active_workers/admission/tests.rs new file mode 100644 index 0000000000..6f263930b3 --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/admission/tests.rs @@ -0,0 +1,1048 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Property-based and example tests for the measured-headroom admission valve. +//! +//! These tests model an executor environment as a shared cell holding a hard +//! `limit`, the current resident `usage`, and the set of resident evictable +//! work (each item carrying a size and an eviction priority). A [`FakeProbe`] +//! reports `usage`/`limit` from the cell; a [`FakeEvictionSource`] reclaims +//! idle-then-warm items and decrements `usage`. Admitting a request adds its +//! size to `usage` as a new resident, non-evictable item (it is actively being +//! created). +//! +//! The model lets `proptest` drive thousands of random admit sequences — with +//! 
random request sizes, pre-resident work, and limits — and assert the +//! invariants that *define* a correct safety valve: +//! +//! 1. Safety: usage never exceeds the limit (the environment never OOMs). +//! 2. No spurious eviction: when headroom is ample, nothing is evicted. +//! 3. Eviction ordering: idle work is reclaimed before warm work. +//! 4. Clean rejection: when the request genuinely cannot fit, the decision is +//! `Reject` and no over-commit happens. + +use super::*; +use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot}; +use proptest::prelude::*; +use std::sync::{Arc, Mutex}; +use test_r::test; + +test_r::enable!(); + +/// One unit of resident, evictable work in the model. +#[derive(Debug, Clone, Copy)] +struct Resident { + size: u64, + priority: EvictionPriority, +} + +/// An admitted request whose pages have not yet fully faulted into RSS. +/// +/// Models the gap between admission and residency: the worker has been admitted +/// for `reserved` bytes but only `resident` of them have actually touched memory +/// so far. Real RSS (what the probe reads) reflects only `resident`; the +/// remaining `reserved - resident` bytes are still in flight and will appear in +/// RSS later. This lag is what lets concurrent admissions on the same RSS +/// snapshot collectively over-commit. +#[derive(Debug, Clone, Copy)] +struct InFlight { + reserved: u64, + resident: u64, +} + +/// Shared model of the executor environment's memory. +#[derive(Debug, Default)] +struct EnvState { + limit: u64, + /// Resident bytes attributed to admitted, currently-active requests that + /// are not yet evictable (they are mid-admission). + pinned_usage: u64, + /// Resident, evictable work — what the controller may reclaim. + residents: Vec, + /// Admitted requests whose pages are still faulting in. Their `resident` + /// portion counts toward measured RSS now; their full `reserved` size is + /// what RSS will reach once they are fully resident. 
+ in_flight: Vec, + /// Count of evictions performed, for the no-spurious-eviction property. + evictions: usize, + /// The priorities evicted, in order, for the ordering property. + eviction_order: Vec, +} + +impl EnvState { + /// Measured RSS: the bytes that have actually faulted in. Lags behind what + /// has been admitted, because in-flight requests are only partially + /// resident. This is what the probe reports. + fn usage(&self) -> u64 { + self.pinned_usage + + self.residents.iter().map(|r| r.size).sum::() + + self.in_flight.iter().map(|f| f.resident).sum::() + } + + /// Total bytes that admitted work will eventually occupy once every + /// in-flight request has fully faulted in. The safety property is stated + /// against this value: reserved bytes always become resident, so if this + /// can exceed the limit the environment will OOM once the lag resolves. + fn eventual_usage(&self) -> u64 { + self.pinned_usage + + self.residents.iter().map(|r| r.size).sum::() + + self.in_flight.iter().map(|f| f.reserved).sum::() + } + + /// Advance residency: each in-flight request faults in up to `step` more of + /// its reserved bytes, raising measured RSS toward its eventual size. + /// Fully-resident requests are retired into `pinned_usage`. + fn tick_residency(&mut self, step: u64) { + for f in &mut self.in_flight { + let remaining = f.reserved - f.resident; + f.resident += remaining.min(step); + } + let (done, pending): (Vec<_>, Vec<_>) = self + .in_flight + .drain(..) + .partition(|f| f.resident >= f.reserved); + self.pinned_usage += done.iter().map(|f| f.reserved).sum::(); + self.in_flight = pending; + } + + /// Fault in `step` bytes of granted-but-untouched memory belonging to the + /// in-flight request at `index`, without faulting in any other request. A + /// worker may touch the virtual memory it was already granted at any later + /// time, with no admission call in the loop, so this raises measured RSS for + /// one worker in isolation. 
+ fn fault_in_one(&mut self, index: usize, step: u64) { + if let Some(f) = self.in_flight.get_mut(index) { + let remaining = f.reserved - f.resident; + f.resident += remaining.min(step); + } + } + + /// Remove the in-flight worker at `index`: it finishes and unloads, freeing + /// both its resident pages and its remaining grant. Measured RSS drops by its + /// resident portion. Returns the bytes it was admitted for, so the caller can + /// release the gate's reservation for it. The surviving workers' reservations + /// for their own untouched grants must not be credited by this drop. + fn exit_one(&mut self, index: usize) -> Option { + if index < self.in_flight.len() { + Some(self.in_flight.remove(index).reserved) + } else { + None + } + } +} + +#[derive(Debug, Clone)] +struct FakeProbe { + state: Arc>, +} + +impl MemoryProbe for FakeProbe { + fn snapshot(&self) -> MemorySnapshot { + let state = self.state.lock().unwrap(); + MemorySnapshot { + limit_bytes: state.limit, + current_bytes: state.usage(), + } + } +} + +struct FakeEvictionSource { + state: Arc>, + /// The gate, so eviction can release each evicted resident's grant — in + /// production, eviction unloads the worker, which releases its grant. + controller: Arc, +} + +#[async_trait::async_trait] +impl EvictionSource for FakeEvictionSource { + async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 { + let mut state = self.state.lock().unwrap(); + let mut freed = 0u64; + // Evict only at the requested tier, oldest-first (model: vec order), + // until we have freed at least `needed_bytes` or the tier is empty. 
+ let mut i = 0; + while freed < needed_bytes && i < state.residents.len() { + if state.residents[i].priority == priority { + let victim = state.residents.remove(i); + freed += victim.size; + self.controller.release(victim.size); + state.evictions += 1; + state.eviction_order.push(priority); + } else { + i += 1; + } + } + freed + } +} + +fn controller(state: Arc>) -> Arc { + controller_with_ratio(state, 1.0) +} + +fn controller_with_ratio( + state: Arc>, + usable_ratio: f64, +) -> Arc { + // Workers already resident when the gate is created had their grants + // registered at their own admission; seed the gate to match. + let initial_granted = { + let s = state.lock().unwrap(); + s.pinned_usage + s.residents.iter().map(|r| r.size).sum::() + }; + let controller = AdmissionController::new( + Box::new(FakeProbe { + state: state.clone(), + }), + AdmissionPolicy { usable_ratio }, + ); + controller.seed_granted(initial_granted); + Arc::new(controller) +} + +fn eviction_source( + state: Arc>, + controller: Arc, +) -> FakeEvictionSource { + FakeEvictionSource { state, controller } +} + +/// Apply one admission attempt against the model, mutating `usage` on admit. +async fn apply_admit( + controller: &AdmissionController, + source: &FakeEvictionSource, + state: &Arc>, + request: u64, +) -> AdmissionDecision { + let decision = controller.try_admit(request, source).await; + if decision == AdmissionDecision::Admit { + state.lock().unwrap().pinned_usage += request; + } + decision +} + +/// Apply one admission attempt where admitted bytes do NOT become resident +/// immediately. On admit the request is recorded as in-flight with zero resident +/// bytes, so measured RSS is unchanged until a later residency tick faults its +/// pages in. This models the real lag between admission and RSS, the window in +/// which concurrent admissions on the same snapshot can collectively +/// over-commit. 
+async fn apply_staggered_admit( + controller: &AdmissionController, + source: &FakeEvictionSource, + state: &Arc>, + request: u64, +) -> AdmissionDecision { + let decision = controller.try_admit(request, source).await; + if decision == AdmissionDecision::Admit { + state.lock().unwrap().in_flight.push(InFlight { + reserved: request, + resident: 0, + }); + } + decision +} + +/// A probe with a fixed limit that always reports zero current usage, so the +/// gate's admission decision is driven solely by the granted accounting against +/// the ceiling. Used by the concurrency test, where the property under test is +/// that the granted counter cannot be over-committed by racing admissions. +#[derive(Debug)] +struct ZeroUsageProbe { + limit: u64, +} + +impl MemoryProbe for ZeroUsageProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit, + current_bytes: 0, + } + } +} + +/// An eviction source with nothing to evict: a rejected request stays rejected. +struct NoEvictionSource; + +#[async_trait::async_trait] +impl EvictionSource for NoEvictionSource { + async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 { + 0 + } +} + +/// Concurrent admissions must never grant more than the ceiling allows. +/// +/// Many admit attempts of equal size race against a controller whose ceiling +/// admits only a known number of them, with no evictable work to fall back on. +/// Exactly `ceiling / request` requests must be admitted and the rest rejected; +/// the total granted must never exceed the ceiling. This can only hold if each +/// admission's "is there room? then reserve" sequence is atomic against the +/// others — if two admits read the same headroom before either reserves, both +/// pass and the granted total overshoots the ceiling. 
+#[test] +fn concurrent_admissions_never_overcommit_the_ceiling() { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(8) + .build() + .unwrap(); + + rt.block_on(async { + const REQUEST: u64 = 10; + const CAPACITY: u64 = 50; // exactly 5 requests fit + const ATTEMPTS: usize = 200; // far more than fit, all racing + + let controller = Arc::new(AdmissionController::new( + Box::new(ZeroUsageProbe { limit: CAPACITY }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + + let mut handles = Vec::with_capacity(ATTEMPTS); + for _ in 0..ATTEMPTS { + let controller = controller.clone(); + handles.push(tokio::spawn(async move { + controller.try_admit(REQUEST, &NoEvictionSource).await + })); + } + + let mut admitted = 0usize; + for handle in handles { + if handle.await.unwrap() == AdmissionDecision::Admit { + admitted += 1; + } + } + + let expected = (CAPACITY / REQUEST) as usize; + assert_eq!( + admitted, expected, + "expected exactly {expected} admissions to fit, got {admitted}" + ); + // With zero measured usage, headroom is the ceiling minus granted; if it + // equals the full ceiling again, everything admitted was released, which + // never happens here. The decisive check: the admitted total fits. 
+ assert!( + admitted as u64 * REQUEST <= CAPACITY, + "granted {} exceeded ceiling {CAPACITY}", + admitted as u64 * REQUEST + ); + }); +} + +// ── Single-case unit tests ─────────────────────────────────────────────────── + +#[test] +async fn admits_when_headroom_is_ample_without_evicting() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + pinned_usage: 0, + residents: vec![Resident { + size: 100, + priority: EvictionPriority::Idle, + }], + ..Default::default() + })); + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + let decision = apply_admit(&ctrl, &source, &state, 200).await; + assert_eq!(decision, AdmissionDecision::Admit); + // Nothing should have been evicted — there was plenty of headroom. + assert_eq!(state.lock().unwrap().evictions, 0); +} + +#[test] +async fn evicts_idle_before_warm() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + pinned_usage: 0, + residents: vec![ + Resident { + size: 400, + priority: EvictionPriority::Warm, + }, + Resident { + size: 400, + priority: EvictionPriority::Idle, + }, + ], + ..Default::default() + })); + // usage = 800, limit = 1000, headroom = 200. Request 300 → shortfall 100. + // One idle (400) covers it; warm must remain untouched. + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + let decision = apply_admit(&ctrl, &source, &state, 300).await; + assert_eq!(decision, AdmissionDecision::Admit); + + let s = state.lock().unwrap(); + assert_eq!(s.eviction_order, vec![EvictionPriority::Idle]); + assert!(s.usage() <= s.limit); +} + +#[test] +async fn rejects_when_nothing_can_be_freed() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + // All usage is pinned (mid-admission), nothing evictable. 
+ pinned_usage: 950, + residents: vec![], + ..Default::default() + })); + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + let decision = apply_admit(&ctrl, &source, &state, 200).await; + assert_eq!(decision, AdmissionDecision::Reject); + // No over-commit: usage unchanged. + assert_eq!(state.lock().unwrap().usage(), 950); +} + +// ── Property tests ─────────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +enum Op { + Admit(u64), +} + +/// An operation in a staggered-start schedule. Unlike [`Op`], admitted bytes do +/// not become resident immediately — `Tick` advances residency separately, so +/// the schedule can interleave admissions and page-faulting in any order. +#[derive(Debug, Clone)] +enum StaggeredOp { + /// Attempt to admit a worker reserving this many bytes. + Admit(u64), + /// Fault in up to this many more bytes of every in-flight worker. + Tick(u64), +} + +fn arb_resident_priority() -> impl Strategy { + prop_oneof![Just(EvictionPriority::Idle), Just(EvictionPriority::Warm)] +} + +fn arb_ops() -> impl Strategy> { + prop::collection::vec((1u64..800).prop_map(Op::Admit), 0..40) +} + +/// Strategy yielding a `(limit, residents)` start state where the residents fit +/// under the limit by construction, by carving each resident's size out of a +/// remaining budget. A resident set exceeding the limit cannot occur in reality +/// (it would already have been OOM-killed), so it is not a valid start state. +fn arb_fitting_state( + limit_range: std::ops::Range, + max_residents: usize, +) -> impl Strategy)> { + limit_range.prop_flat_map(move |limit| { + // Reserve a fraction of the limit for residents (0..=80%) so there is + // usually some free headroom in the start state too. Each resident then + // takes a slice of that budget. 
+ ( + Just(limit), + (0u64..=(limit * 4 / 5)), + prop::collection::vec((1u64..=1000, arb_resident_priority()), 0..max_residents), + ) + .prop_map(|(limit, mut budget, raw)| { + let mut residents = Vec::new(); + for (weight, priority) in raw { + if budget == 0 { + break; + } + // Each resident is at most a third of the remaining budget, + // so several can coexist; clamp to whatever budget is left. + let size = weight.min(budget.div_ceil(3)).max(1).min(budget); + residents.push(Resident { size, priority }); + budget -= size; + } + (limit, residents) + }) + }) +} + +proptest! { + /// Safety invariant: across any random sequence of admits — with random + /// pre-resident work and random sizes — modeled usage must never exceed the + /// limit. This is the property that rules out OOM. + #[test] + fn usage_never_exceeds_limit( + (limit, residents) in arb_fitting_state(500..5000, 20), + ops in arb_ops(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in ops { + match op { + Op::Admit(req) => { + apply_admit(&ctrl, &source, &state, req).await; + let s = state.lock().unwrap(); + prop_assert!( + s.usage() <= s.limit, + "usage {} exceeded limit {}", s.usage(), s.limit + ); + } + } + } + Ok(()) + }).unwrap(); + } + + /// No spurious eviction: if every admit in the sequence fits within the + /// admissible headroom at the moment it is issued, nothing is ever evicted. + /// We guarantee the precondition by giving a huge limit and small requests. 
+ #[test] + fn no_eviction_when_headroom_ample( + residents in prop::collection::vec( + (1u64..500, arb_resident_priority()) + .prop_map(|(size, priority)| Resident { size, priority }), + 0..20, + ), + ops in prop::collection::vec((1u64..50).prop_map(Op::Admit), 0..30), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit: 1_000_000, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in ops { + match op { + Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; } + } + } + prop_assert_eq!(state.lock().unwrap().evictions, 0); + Ok(()) + }).unwrap(); + } + + /// Eviction ordering: whenever eviction happens, no warm item is evicted + /// while an idle item was still available to evict at that step. We check + /// the weaker, order-level invariant that the recorded eviction order never + /// has a warm eviction before an idle one within a single `try_admit` call + /// — i.e. idle is always drained first. + #[test] + fn idle_evicted_before_warm( + (limit, residents) in arb_fitting_state(500..3000, 25), + ops in prop::collection::vec((1u64..1500).prop_map(Op::Admit), 1..20), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in ops { + match op { + Op::Admit(req) => { apply_admit(&ctrl, &source, &state, req).await; } + } + } + + // Once a warm eviction appears in the order, an idle eviction must + // never follow it (idle is always exhausted first). 
+ let order = state.lock().unwrap().eviction_order.clone(); + let mut seen_warm = false; + for p in order { + match p { + EvictionPriority::Warm => seen_warm = true, + EvictionPriority::Idle => prop_assert!( + !seen_warm, + "idle eviction followed a warm eviction" + ), + } + } + Ok(()) + }).unwrap(); + } +} + +// ── Staggered-start safety ─────────────────────────────────────────────────── + +/// A schedule of admissions interleaved with residency ticks. Admissions +/// reserve bytes that only become resident when a later `Tick` faults them in, +/// so the schedule exercises the lag between admission and measured RSS in which +/// concurrent admissions can collectively over-commit. Skewed toward `Admit` so +/// bursts of admissions land between ticks (the dangerous case). +fn arb_staggered_schedule() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 3 => (1u64..800).prop_map(StaggeredOp::Admit), + 1 => (1u64..800).prop_map(StaggeredOp::Tick), + ], + 0..60, + ) +} + +proptest! { + /// Safety invariant under staggered starts: for any interleaving of + /// admissions and residency ticks, once every admitted worker has fully + /// faulted its pages in, resident usage must not exceed the limit. + /// + /// Reserved bytes always eventually become resident, so the check is made + /// against the state after a final full-residency tick: if that can exceed + /// the limit, the environment OOMs once the admission lag resolves. This is + /// the general form of the staggered-burst case — admissions that read the + /// same low RSS snapshot before each other's pages are counted. 
+ #[test] + fn staggered_starts_never_exceed_limit_once_resident( + (limit, residents) in arb_fitting_state(500..5000, 20), + schedule in arb_staggered_schedule(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { + limit, + pinned_usage: 0, + residents, + ..Default::default() + })); + let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in schedule { + match op { + StaggeredOp::Admit(req) => { + apply_staggered_admit(&ctrl, &source, &state, req).await; + } + StaggeredOp::Tick(step) => { + state.lock().unwrap().tick_residency(step); + } + } + // Even mid-flight, measured RSS must never exceed the limit. + let s = state.lock().unwrap(); + prop_assert!( + s.usage() <= s.limit, + "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit + ); + } + + // Fault in everything still in flight, then check the eventual + // resident footprint fits. + state.lock().unwrap().tick_residency(u64::MAX); + let s = state.lock().unwrap(); + prop_assert!( + s.eventual_usage() <= s.limit, + "eventual resident usage {} exceeded limit {} once fully resident", + s.eventual_usage(), s.limit + ); + Ok(()) + }).unwrap(); + } +} + +// ── Granted virtual memory ─────────────────────────────────────────────────── + +/// One step of a schedule that stresses granted-but-untouched memory. +#[derive(Debug, Clone)] +enum GrantOp { + /// Attempt to admit a worker granted this many bytes of linear memory. + Grant(u64), + /// Fault in up to this many bytes of the in-flight worker at this index, + /// in isolation from the others. + FaultIn(usize, u64), + /// The in-flight worker at this index finishes and unloads, dropping its + /// resident pages and its remaining grant. 
+ Exit(usize), +} + +fn arb_grant_schedule() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 3 => (1u64..800).prop_map(GrantOp::Grant), + 3 => (0usize..20, 1u64..800).prop_map(|(i, step)| GrantOp::FaultIn(i, step)), + 1 => (0usize..20).prop_map(GrantOp::Exit), + ], + 0..80, + ) +} + +proptest! { + /// A worker may fault in the virtual memory it was already granted at any + /// later time, with no admission call in the loop. Once every granted byte + /// of every admitted worker becomes resident, that resident footprint must + /// not exceed the limit. + /// + /// Granted bytes can always become resident — nothing in the runtime forces + /// a worker to leave granted pages untouched — so the safety check is made + /// against the sum of granted sizes after faulting everything in. If that + /// can exceed the limit, a node of workers touching their already-granted + /// pages will OOM with no grow and no admission to intercept it. + #[test] + fn granted_memory_never_exceeds_limit_once_faulted_in( + limit in 800u64..6000, + schedule in arb_grant_schedule(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() })); + // usable_ratio 1.0 isolates the granted-memory hole from the host + // carve-out. 
+ let ctrl = controller(state.clone()); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in schedule { + match op { + GrantOp::Grant(bytes) => { + apply_staggered_admit(&ctrl, &source, &state, bytes).await; + } + GrantOp::FaultIn(index, step) => { + state.lock().unwrap().fault_in_one(index, step); + } + GrantOp::Exit(index) => { + let reserved = state.lock().unwrap().exit_one(index); + if let Some(reserved) = reserved { + ctrl.release(reserved); + } + } + } + let s = state.lock().unwrap(); + prop_assert!( + s.usage() <= s.limit, + "resident usage {} exceeded limit {} mid-schedule", s.usage(), s.limit + ); + } + + // Every granted byte may yet fault in. Once it all does, it must fit. + state.lock().unwrap().tick_residency(u64::MAX); + let s = state.lock().unwrap(); + prop_assert!( + s.eventual_usage() <= s.limit, + "granted memory {} exceeded limit {} once fully faulted in", + s.eventual_usage(), s.limit + ); + Ok(()) + }).unwrap(); + } + + /// Liveness: once every admitted worker has unloaded and its pages have left + /// memory, the gate's admissible headroom must return to the full ceiling. + /// + /// Reservations for workers that exit while still holding untouched granted + /// memory must be released on unload. If they were not, each such exit would + /// permanently shrink headroom, and a node churning workers would slowly + /// refuse all admissions despite being empty. 
+ #[test] + fn headroom_recovers_after_all_workers_exit( + limit in 800u64..6000, + schedule in arb_grant_schedule(), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let usable_ratio = 0.8; + let state = Arc::new(Mutex::new(EnvState { limit, ..Default::default() })); + let ctrl = controller_with_ratio(state.clone(), usable_ratio); + let source = eviction_source(state.clone(), ctrl.clone()); + + for op in schedule { + match op { + GrantOp::Grant(bytes) => { + apply_staggered_admit(&ctrl, &source, &state, bytes).await; + } + GrantOp::FaultIn(index, step) => { + state.lock().unwrap().fault_in_one(index, step); + } + GrantOp::Exit(index) => { + let reserved = state.lock().unwrap().exit_one(index); + if let Some(reserved) = reserved { + ctrl.release(reserved); + } + } + } + } + + // Unload every worker still resident, releasing each reservation, and + // clear measured RSS — the environment is now empty. + loop { + let reserved = state.lock().unwrap().exit_one(0); + match reserved { + Some(reserved) => ctrl.release(reserved), + None => break, + } + } + { + let mut s = state.lock().unwrap(); + s.pinned_usage = 0; + s.residents.clear(); + } + + let ceiling = (limit as f64 * usable_ratio) as u64; + let headroom = ctrl.headroom_bytes(); + prop_assert_eq!( + headroom, ceiling, + "headroom {} did not recover to ceiling {} after all workers exited", + headroom, ceiling + ); + Ok(()) + }).unwrap(); + } +} + +// ── Density ────────────────────────────────────────────────────────────────── + +proptest! { + /// Density invariant: in a settled state (no admission lag outstanding), the + /// gate packs the environment to within one request of the usable ceiling + /// before it starts rejecting. It must not stop admitting while substantial + /// usable room remains. 
+ /// + /// The schedule admits a fixed request size, fully faulting each admitted + /// worker in before the next admit so measured RSS tracks admitted bytes and + /// the in-flight reservation drains to zero — the steady-state regime where + /// density matters. At the first rejection, resident usage must be at least + /// `ceiling - request`: the only room a correct gate may leave free is the + /// part too small to fit one more request. + #[test] + fn admits_to_within_one_request_of_the_ceiling( + limit in 2000u64..20_000, + request in 50u64..600, + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let usable_ratio = 0.8; + let state = Arc::new(Mutex::new(EnvState { + limit, + ..Default::default() + })); + let ctrl = controller_with_ratio(state.clone(), usable_ratio); + let source = eviction_source(state.clone(), ctrl.clone()); + + let ceiling = (limit as f64 * usable_ratio) as u64; + + // Admit until the first rejection, faulting each worker fully in + // before the next so no reservation lag is outstanding. + let mut rejected = false; + for _ in 0..((limit / request) + 2) { + let decision = apply_staggered_admit(&ctrl, &source, &state, request).await; + state.lock().unwrap().tick_residency(u64::MAX); + if decision == AdmissionDecision::Reject { + rejected = true; + break; + } + } + + prop_assert!(rejected, "gate never rejected; ceiling {ceiling} too large for the schedule"); + + let s = state.lock().unwrap(); + prop_assert!( + s.usage() + request > ceiling, + "gate rejected at resident usage {} with ceiling {ceiling}: left more than one request ({request}) of usable room free", + s.usage() + ); + // And it must never have over-committed. 
+ prop_assert!(s.eventual_usage() <= s.limit); + Ok(()) + }).unwrap(); + } +} + +// ── Carve-out ratio ────────────────────────────────────────────────────────── + +#[test] +async fn usable_ratio_caps_admission_below_full_limit() { + let state = Arc::new(Mutex::new(EnvState { + limit: 1000, + pinned_usage: 0, + residents: vec![], + ..Default::default() + })); + // ceiling = 0.8 * 1000 = 800. Request 850 must be rejected even though the + // raw limit (1000) would allow it — the top 20% is reserved for the host. + let ctrl = controller_with_ratio(state.clone(), 0.8); + let source = eviction_source(state.clone(), ctrl.clone()); + + assert_eq!( + apply_admit(&ctrl, &source, &state, 850).await, + AdmissionDecision::Reject + ); + assert_eq!( + apply_admit(&ctrl, &source, &state, 800).await, + AdmissionDecision::Admit + ); +} + +/// Concurrent memory grows must not deadlock against the admission eviction +/// scan. +/// +/// A memory grow acquires a permit while the growing worker holds its own +/// instance lock, and the admission slow path scans the worker set, taking each +/// other worker's instance lock to classify it for eviction. With many workers +/// growing at once under memory pressure these two must not form an AB-BA cycle. +/// Workloads that never grow memory never exercise this path. +mod grow_lock_ordering { + use super::super::{AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource}; + use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot}; + use std::sync::Arc; + use std::time::Duration; + use test_r::test; + use tokio::sync::Mutex as AsyncMutex; + + /// Per-worker lock, standing in for `Worker::instance`. + type WorkerLock = Arc>; + + /// Probe pinned to zero admissible headroom so `try_admit` takes the slow + /// (scanning) path, modelling the moment a grow's requested delta does not + /// fit the current headroom. 
+ #[derive(Debug)] + struct SaturatedProbe; + + impl MemoryProbe for SaturatedProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: 1, + current_bytes: u64::MAX, + } + } + } + + /// Probe reporting ample headroom so `try_admit` takes the fast path and + /// never scans — the same grow code path, but not under memory pressure. + #[derive(Debug)] + struct AmpleHeadroomProbe; + + impl MemoryProbe for AmpleHeadroomProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: u64::MAX, + current_bytes: 0, + } + } + } + + /// Eviction source that, like `evict_at_most_memory`, scans every worker and + /// takes each worker's instance lock (via `eviction_class`) to classify it. + /// Frees nothing (all workers active). The lock on each worker is held only + /// briefly, faithfully — the deadlock comes from the ordering, not hold time. + struct ScanningEvictionSource { + workers: Vec, + } + + #[async_trait::async_trait] + impl EvictionSource for ScanningEvictionSource { + async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 { + for worker in &self.workers { + let _guard = worker.lock().await; + } + 0 + } + } + + /// Models the grow path's lock interaction: run the admission scan, which + /// takes other workers' instance locks, without holding this worker's own + /// instance lock, then take it afterwards to merge the permit (as + /// `Worker::increase_memory` does). 
+ async fn grow_then_lock( + controller: &AdmissionController, + own: &WorkerLock, + workers: Vec, + ) { + let source = ScanningEvictionSource { workers }; + controller.try_admit(1, &source).await; + let _own_guard = own.lock().await; + } + + fn workers(n: usize) -> Vec { + (0..n).map(|_| Arc::new(AsyncMutex::new(()))).collect() + } + + fn controller(probe: Box) -> Arc { + Arc::new(AdmissionController::new( + probe, + AdmissionPolicy { usable_ratio: 1.0 }, + )) + } + + /// Many workers growing concurrently under memory pressure (every grow takes + /// the scanning slow path) must all complete without deadlocking. + #[test(flavor = "multi_thread", worker_threads = 4)] + async fn concurrent_grows_do_not_deadlock_under_pressure() { + const WORKERS: usize = 32; + const DEADLINE: Duration = Duration::from_secs(10); + + let workers = workers(WORKERS); + let controller = controller(Box::new(SaturatedProbe)); + + let mut grows = Vec::new(); + for i in 0..WORKERS { + let controller = controller.clone(); + let all = workers.clone(); + let own = workers[i].clone(); + grows.push(tokio::spawn(async move { + grow_then_lock(&controller, &own, all).await; + })); + } + + let all_done = async { + for task in grows { + let _ = task.await; + } + }; + + let result = tokio::time::timeout(DEADLINE, all_done).await; + assert!( + result.is_ok(), + "concurrent grows deadlocked: the scan must not run while a worker holds its own instance lock" + ); + } + + /// With comfortable headroom the gate admits on the fast path without + /// scanning, so no worker's instance lock is taken during admission and + /// concurrent grows complete. Confirms the deadlock risk is specific to the + /// scan-under-pressure path. 
+ #[test(flavor = "multi_thread", worker_threads = 4)] + async fn no_deadlock_with_ample_headroom() { + const WORKERS: usize = 32; + const DEADLINE: Duration = Duration::from_secs(10); + + let workers = workers(WORKERS); + let controller = controller(Box::new(AmpleHeadroomProbe)); + + let mut grows = Vec::new(); + for i in 0..WORKERS { + let controller = controller.clone(); + let all = workers.clone(); + let own = workers[i].clone(); + grows.push(tokio::spawn(async move { + grow_then_lock(&controller, &own, all).await; + })); + } + + let all_done = async { + for task in grows { + let _ = task.await; + } + }; + + let result = tokio::time::timeout(DEADLINE, all_done).await; + assert!( + result.is_ok(), + "grows with ample headroom should not scan and should not deadlock" + ); + } +} diff --git a/golem-worker-executor/src/services/active_workers/component_charge/mod.rs b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs new file mode 100644 index 0000000000..8ddd4aa8aa --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/component_charge/mod.rs @@ -0,0 +1,171 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Per-component memory charge for the shared compiled module. +//! +//! A component's compiled module is loaded into the wasmtime engine once and +//! shared by every worker of that component, so its size must be charged to the +//! 
memory pool once per resident component rather than once per worker. This +//! registry tracks how many workers of each component are resident and holds a +//! single module-sized charge for as long as at least one is. +//! +//! The charge is represented by an opaque guard obtained from a [`ChargeSource`] +//! (the worker memory pool in production). The first resident worker of a +//! component acquires the charge; the last to unload drops it. The registry is +//! decoupled from the pool via [`ChargeSource`] so the refcounting can be +//! property-tested in isolation. + +use async_trait::async_trait; +use std::collections::HashMap; +use std::fmt::Debug; +use std::hash::Hash; +use std::sync::{Arc, Mutex}; + +/// Acquires an opaque, RAII charge of a given byte size from some pool. The +/// returned value releases the charge when dropped. +#[async_trait] +pub trait ChargeSource: Send + Sync { + type Charge: Send + Sync + 'static; + + async fn acquire_charge(&self, bytes: u64) -> Self::Charge; +} + +/// Tracks resident-worker refcounts per component key and holds one module-sized +/// charge per component while any worker of it is resident. +pub struct ComponentChargeRegistry { + source: S, + state: Mutex>>, +} + +struct Entry { + refcount: usize, + /// The held module charge. Always `Some` while `refcount > 0`. + charge: Option>, +} + +/// Handle representing one worker's residency of a component. While at least one +/// `ComponentChargeGuard` for a key is alive, the registry holds that +/// component's module charge. Dropping the last guard releases it. +pub struct ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + 'static, +{ + registry: Arc>, + key: K, +} + +impl Debug for ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + 'static, + S: ChargeSource, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ComponentChargeGuard").finish() + } +} + +/// Type-erased held component charge. 
A worker holds one of these for as long as +/// it is resident; dropping it releases the worker's residency of its component. +/// Erasing the source/key types lets non-generic holders store the guard. +pub trait HeldComponentCharge: Send + Sync + Debug {} + +impl HeldComponentCharge for ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + Sync + 'static, + S: ChargeSource + 'static, + S::Charge: Sync, +{ +} + +impl ComponentChargeRegistry +where + K: Eq + Hash + Clone + Send + 'static, + S: ChargeSource, +{ + pub fn new(source: S) -> Arc { + Arc::new(Self { + source, + state: Mutex::new(HashMap::new()), + }) + } + + /// Register one resident worker of `key` (whose module is `charge_bytes`). + /// Acquires the module charge if this is the first resident worker of the + /// component. The returned guard releases residency on drop. + pub async fn acquire( + self: &Arc, + key: K, + charge_bytes: u64, + ) -> ComponentChargeGuard { + // Decide under the lock whether this caller is the one that must acquire + // the (possibly blocking) charge, so only the first resident worker of a + // component does so. Acquire the charge outside the lock, then publish it. + let must_acquire = { + let mut state = self.state.lock().unwrap(); + let entry = state.entry(key.clone()).or_insert(Entry { + refcount: 0, + charge: None, + }); + entry.refcount += 1; + entry.refcount == 1 + }; + + if must_acquire { + let charge = Arc::new(self.source.acquire_charge(charge_bytes).await); + let mut state = self.state.lock().unwrap(); + if let Some(entry) = state.get_mut(&key) { + // Only publish if still resident (refcount could have churned). 
+ if entry.refcount > 0 && entry.charge.is_none() { + entry.charge = Some(charge); + } + } + } + + ComponentChargeGuard { + registry: self.clone(), + key, + } + } + + fn release(&self, key: &K) { + let mut state = self.state.lock().unwrap(); + if let Some(entry) = state.get_mut(key) { + entry.refcount = entry.refcount.saturating_sub(1); + if entry.refcount == 0 { + // Drop the held charge (returns it to the pool) and forget the + // component entirely. + state.remove(key); + } + } + } +} + +impl Drop for ComponentChargeGuard +where + K: Eq + Hash + Clone + Send + 'static, + S: ChargeSource, +{ + fn drop(&mut self) { + self.registry.release(&self.key); + } +} + +impl Debug for ComponentChargeRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ComponentChargeRegistry").finish() + } +} + +#[cfg(test)] +mod tests; diff --git a/golem-worker-executor/src/services/active_workers/component_charge/tests.rs b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs new file mode 100644 index 0000000000..c58f1ab937 --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/component_charge/tests.rs @@ -0,0 +1,206 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Tests for the per-component module charge registry. +//! +//! A [`FakeChargeSource`] models a pool by tracking total charged bytes in an +//! 
atomic; each charge it hands out decrements that total when dropped. The +//! tests then assert the registry's contract: a component's module is charged +//! exactly once while any worker of it is resident, released when the last +//! unloads, and never leaked or double-charged under concurrent churn. + +use super::*; +use proptest::prelude::*; +use std::sync::atomic::{AtomicU64, Ordering}; +use test_r::test; + +test_r::enable!(); + +/// A charge that returns `bytes` to the shared counter when dropped. +struct FakeCharge { + bytes: u64, + charged_total: Arc, +} + +impl Drop for FakeCharge { + fn drop(&mut self) { + self.charged_total.fetch_sub(self.bytes, Ordering::SeqCst); + } +} + +#[derive(Clone)] +struct FakeChargeSource { + charged_total: Arc, + /// Number of times a charge was actually acquired, to detect double-charge. + acquire_count: Arc, +} + +impl FakeChargeSource { + fn new() -> Self { + Self { + charged_total: Arc::new(AtomicU64::new(0)), + acquire_count: Arc::new(AtomicU64::new(0)), + } + } +} + +#[async_trait::async_trait] +impl ChargeSource for FakeChargeSource { + type Charge = FakeCharge; + + async fn acquire_charge(&self, bytes: u64) -> FakeCharge { + self.acquire_count.fetch_add(1, Ordering::SeqCst); + self.charged_total.fetch_add(bytes, Ordering::SeqCst); + FakeCharge { + bytes, + charged_total: self.charged_total.clone(), + } + } +} + +const MODULE_BYTES: u64 = 17 * 1024 * 1024; + +// ── Single-case unit tests ─────────────────────────────────────────────────── + +#[test] +async fn first_worker_charges_once_last_releases() { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let count = source.acquire_count.clone(); + let registry = ComponentChargeRegistry::new(source); + + let g1 = registry.acquire("comp-a", MODULE_BYTES).await; + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + assert_eq!(count.load(Ordering::SeqCst), 1); + + // Second worker of the same component: no additional charge. 
+ let g2 = registry.acquire("comp-a", MODULE_BYTES).await; + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + assert_eq!(count.load(Ordering::SeqCst), 1); + + // Dropping one of two keeps the charge. + drop(g1); + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + + // Dropping the last releases it. + drop(g2); + assert_eq!(charged.load(Ordering::SeqCst), 0); +} + +#[test] +async fn distinct_components_each_charge_once() { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let registry = ComponentChargeRegistry::new(source); + + let _a = registry.acquire("comp-a", MODULE_BYTES).await; + let _b = registry.acquire("comp-b", MODULE_BYTES).await; + let _b2 = registry.acquire("comp-b", MODULE_BYTES).await; + + // Two distinct components → charged twice, regardless of worker count. + assert_eq!(charged.load(Ordering::SeqCst), 2 * MODULE_BYTES); +} + +#[test] +async fn re_acquiring_after_full_release_charges_again() { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let count = source.acquire_count.clone(); + let registry = ComponentChargeRegistry::new(source); + + drop(registry.acquire("comp-a", MODULE_BYTES).await); + assert_eq!(charged.load(Ordering::SeqCst), 0); + + // A fresh residency after full release acquires the charge again. + let _g = registry.acquire("comp-a", MODULE_BYTES).await; + assert_eq!(charged.load(Ordering::SeqCst), MODULE_BYTES); + assert_eq!(count.load(Ordering::SeqCst), 2); +} + +// ── Property tests ─────────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +enum Op { + /// Acquire a guard for component index `usize`. + Acquire(usize), + /// Drop the n-th currently-held guard (modulo number held). + Drop(usize), +} + +fn arb_ops(num_components: usize) -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + (0..num_components).prop_map(Op::Acquire), + (0usize..100).prop_map(Op::Drop), + ], + 0..80, + ) +} + +proptest! 
{ + /// The charged total always equals the sum of `MODULE_BYTES` over the distinct + /// components that currently have at least one held guard. This is the core + /// "once per resident component" contract: never per-worker, never leaked, + /// never double-charged. + #[test] + fn charge_tracks_distinct_resident_components( + num_components in 1usize..6, + ops in arb_ops(6), + ) { + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + rt.block_on(async move { + let source = FakeChargeSource::new(); + let charged = source.charged_total.clone(); + let registry = ComponentChargeRegistry::new(source); + + // Held guards keyed by component index. + let mut held: Vec<(usize, ComponentChargeGuard<&'static str, FakeChargeSource>)> = + Vec::new(); + let keys: Vec<&'static str> = + ["c0", "c1", "c2", "c3", "c4", "c5"][..num_components].to_vec(); + + for op in ops { + match op { + Op::Acquire(i) => { + let i = i % num_components; + let guard = registry.acquire(keys[i], MODULE_BYTES).await; + held.push((i, guard)); + } + Op::Drop(n) => { + if !held.is_empty() { + let idx = n % held.len(); + held.remove(idx); + } + } + } + + // Distinct resident component count == charged_total / MODULE_BYTES. + let mut distinct: Vec = held.iter().map(|(i, _)| *i).collect(); + distinct.sort_unstable(); + distinct.dedup(); + let expected = distinct.len() as u64 * MODULE_BYTES; + prop_assert_eq!( + charged.load(Ordering::SeqCst), + expected, + "charged total did not match distinct resident components" + ); + } + + // After dropping everything, nothing remains charged. 
+ drop(held); + prop_assert_eq!(charged.load(Ordering::SeqCst), 0); + Ok(()) + }).unwrap(); + } +} diff --git a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs index 77c3f74b86..3d20d187b6 100644 --- a/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs +++ b/golem-worker-executor/src/services/active_workers/concurrent_agents_scheduler.rs @@ -48,42 +48,91 @@ struct AccountSchedulerState { struct QueuedAgent { agent_id: AgentId, - waker: tokio::sync::oneshot::Sender, + waker: tokio::sync::oneshot::Sender, } -/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`]. +/// A slot granted from the scheduler: owns the underlying semaphore permit and +/// the responsibility to decrement the account's `running_count` and wake the +/// next queued agent when it is released. /// -/// On drop, decrements the account's running count and wakes the next queued -/// agent (if any). The drop handler is fully synchronous. -pub struct ConcurrentAgentPermit { +/// The `running_count` is incremented together with acquiring the raw permit, +/// and the matching decrement lives only here in `Drop`. This binds the count +/// strictly to the lifetime of the granted permit, regardless of how the slot +/// is disposed of: +/// +/// * it is moved into a [`ConcurrentAgentPermit`] and dropped when the agent +/// releases the slot (the normal case), or +/// * it is sent into a queued waiter's oneshot and that waiter is cancelled +/// before receiving it — the slot is then dropped inside the channel. +/// +/// Both paths run this same `Drop`, so a slot granted to a waiter that is +/// cancelled after the grant succeeded cannot leak the count. 
+struct GrantedSlot { raw: Option, - account: Option>, + account: Arc, account_id: AccountId, } -impl Drop for ConcurrentAgentPermit { +impl Drop for GrantedSlot { fn drop(&mut self) { if let Some(raw) = self.raw.take() { // Return the raw permit to the semaphore first so it is available // for the next queued agent's synchronous try-acquire. drop(raw); - - if let Some(ref account) = self.account { - try_grant_next_sync(account, &self.account_id); - } + try_grant_next_sync(&self.account, &self.account_id); } } } -impl ConcurrentAgentPermit { - /// Consumes the permit without triggering the drop notification. - #[allow(dead_code)] - pub fn into_inner(mut self) -> Option { - self.account = None; +impl GrantedSlot { + /// Take the raw permit out, suppressing this slot's `Drop` bookkeeping. + /// + /// Used only from `drain_ready_queue` when a `send` to a cancelled waiter + /// fails: the slot is returned in the `Err`, but we are still holding the + /// account state lock, so letting its `Drop` run would re-enter + /// `try_grant_next_sync` and deadlock on the same non-reentrant mutex. The + /// caller takes the permit back and performs the accounting inline instead. + fn defuse(mut self) -> Option { self.raw.take() } } +/// RAII permit returned by [`ConcurrentAgentsScheduler::acquire`]. +/// +/// On drop, decrements the account's running count and wakes the next queued +/// agent (if any) via the held [`GrantedSlot`]. Unlimited accounts hold a bare +/// permit with no slot, so dropping them touches no scheduler accounting. The +/// drop handler is fully synchronous. +pub struct ConcurrentAgentPermit { + /// `Some` for limited accounts (carries the scheduler accounting); `None` + /// for unlimited accounts, where `_raw` holds the bare bypass permit. Held + /// purely for its `Drop`, which returns the permit and wakes the next + /// queued agent. + _slot: Option, + /// Bare permit for the unlimited-account bypass path. 
Unused for limited + /// accounts (the permit lives inside `_slot`). + _raw: Option, +} + +impl ConcurrentAgentPermit { + /// A permit for a limited account, carrying the scheduler accounting. + fn from_slot(slot: GrantedSlot) -> Self { + Self { + _slot: Some(slot), + _raw: None, + } + } + + /// A permit for an unlimited account: a bare bypass permit with no + /// scheduler accounting. + fn unlimited(raw: OwnedSemaphorePermit) -> Self { + Self { + _slot: None, + _raw: Some(raw), + } + } +} + impl Default for ConcurrentAgentsScheduler { fn default() -> Self { Self::new() @@ -156,11 +205,7 @@ impl ConcurrentAgentsScheduler { // Unlimited accounts bypass the queue entirely. if is_unlimited(limit) { let raw = self.permits.acquire(account_id, || async { false }).await; - return ConcurrentAgentPermit { - raw: Some(raw), - account: None, - account_id, - }; + return ConcurrentAgentPermit::unlimited(raw); } // Sync the underlying semaphore pool size with the current plan limit @@ -175,16 +220,12 @@ impl ConcurrentAgentsScheduler { let limit = account.resource_entry.max_concurrent_agents_per_executor(); if is_unlimited(limit) { let raw = self.permits.acquire(account_id, || async { false }).await; - return ConcurrentAgentPermit { - raw: Some(raw), - account: None, - account_id, - }; + return ConcurrentAgentPermit::unlimited(raw); } enum AcquireDecision { FastPath(OwnedSemaphorePermit), - Queued(tokio::sync::oneshot::Receiver), + Queued(tokio::sync::oneshot::Receiver), } let decision = { @@ -197,7 +238,7 @@ impl ConcurrentAgentsScheduler { // After a plan upgrade, newly added semaphore permits may allow // queued agents to proceed. Drain what we can before deciding // about the current agent. - drain_ready_queue(&mut state, &account.raw_semaphore, limit, &account_id); + drain_ready_queue(&mut state, &account, limit, &account_id); // Fast path: capacity available, no older waiters, and the raw // semaphore actually has a permit. 
We try-acquire the semaphore @@ -239,26 +280,22 @@ impl ConcurrentAgentsScheduler { "ConcurrentAgentsScheduler: fast-path permit for {agent_id} in account {account_id}" ); - ConcurrentAgentPermit { + ConcurrentAgentPermit::from_slot(GrantedSlot { raw: Some(raw), - account: Some(account), + account, account_id, - } + }) } AcquireDecision::Queued(rx) => { debug!( "ConcurrentAgentsScheduler: {agent_id} queued in account {account_id}, waiting for permit" ); - let raw = rx.await.expect( + let slot = rx.await.expect( "ConcurrentAgentsScheduler: oneshot sender dropped without sending — scheduler bug", ); - ConcurrentAgentPermit { - raw: Some(raw), - account: Some(account), - account_id, - } + ConcurrentAgentPermit::from_slot(slot) } } } @@ -299,7 +336,7 @@ impl ConcurrentAgentsScheduler { /// be fully synchronous. Uses `tokio::sync::Semaphore::try_acquire_owned` /// (which is synchronous despite being on a tokio type) to acquire permits /// for queued agents. -fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) { +fn try_grant_next_sync(account: &Arc, account_id: &AccountId) { let limit = account.resource_entry.max_concurrent_agents_per_executor(); if is_unlimited(limit) { return; @@ -308,7 +345,7 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) { let mut state = account.state.lock().unwrap(); state.running_count = state.running_count.saturating_sub(1); - drain_ready_queue(&mut state, &account.raw_semaphore, limit, account_id); + drain_ready_queue(&mut state, account, limit, account_id); } /// Try to grant permits to queued agents from the front of the ready queue. @@ -316,9 +353,15 @@ fn try_grant_next_sync(account: &AccountScheduler, account_id: &AccountId) { /// Called both from `try_grant_next_sync` (Drop path) and from `acquire` /// (after a plan-upgrade sync adds new permits). Fully synchronous — only /// uses `try_acquire_owned` which does not block. 
+/// +/// Each granted permit is wrapped in a [`GrantedSlot`] carrying the +/// `running_count` decrement, so a waiter cancelled after a successful send +/// still releases its slot (via the slot's `Drop` when the oneshot channel is +/// dropped) rather than leaking the count. The increment here is matched +/// one-for-one by that slot's `Drop`. fn drain_ready_queue( state: &mut AccountSchedulerState, - raw_semaphore: &Arc, + account: &Arc, limit: u64, account_id: &AccountId, ) { @@ -326,13 +369,24 @@ fn drain_ready_queue( let queued = state.ready_queue.pop_front().unwrap(); // tokio::sync::Semaphore::try_acquire_owned is synchronous. - match raw_semaphore.clone().try_acquire_owned() { + match account.raw_semaphore.clone().try_acquire_owned() { Ok(raw) => { state.running_count += 1; - if queued.waker.send(raw).is_err() { - // Waiter was cancelled; the permit inside the oneshot - // is dropped, returning it to the semaphore. Decrement - // and try next. + let slot = GrantedSlot { + raw: Some(raw), + account: account.clone(), + account_id: *account_id, + }; + if let Err(slot) = queued.waker.send(slot) { + // Waiter was cancelled before we could hand it the slot. + // We are still holding the state lock, so we must not let + // the returned slot's `Drop` run (it would re-enter this + // path via `try_grant_next_sync` and deadlock). Defuse it, + // return its permit to the semaphore, and account for it + // inline, then try the next queued agent. 
+ if let Some(raw) = slot.defuse() { + drop(raw); + } state.running_count -= 1; debug!( "ConcurrentAgentsScheduler: waiter {} cancelled in account {account_id}, trying next", diff --git a/golem-worker-executor/src/services/active_workers/memory_probe.rs b/golem-worker-executor/src/services/active_workers/memory_probe.rs new file mode 100644 index 0000000000..6940b53db4 --- /dev/null +++ b/golem-worker-executor/src/services/active_workers/memory_probe.rs @@ -0,0 +1,238 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Platform-abstracted probe of the executor's real memory usage and limit. +//! +//! Reports the measured resident memory and hard limit of the process's +//! environment, used as the authoritative input to admission decisions (in +//! contrast to the estimate-based semaphore in [`super::ActiveWorkers`]). +//! +//! The trait is abstract over where the limit comes from: a containerised Linux +//! deployment reads it from the cgroup, an unconstrained process reads host RAM, +//! a configured override pins it explicitly. Backend fidelity is asymmetric — +//! cgroup v2 gives the exact kernel-enforced number; other targets fall back to +//! best-effort process RSS via [`ProcessRssProbe`] until dedicated macOS and +//! Windows backends land. + +use std::fmt::Debug; + +/// A snapshot of the executor environment's memory state. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MemorySnapshot {
    /// The hard ceiling in bytes: cgroup `memory.max` on constrained Linux,
    /// otherwise the configured cap or host RAM. Reaching this with `current`
    /// triggers an OOM-kill.
    pub limit_bytes: u64,
    /// Bytes resident right now: cgroup `memory.current` on Linux (touched
    /// pages, lagging but exact), process RSS otherwise.
    pub current_bytes: u64,
}

impl MemorySnapshot {
    /// Distance in bytes from current usage up to the hard limit.
    ///
    /// The subtraction saturates, so a snapshot whose `current_bytes` exceeds
    /// `limit_bytes` yields `0` rather than underflowing.
    pub fn headroom_bytes(&self) -> u64 {
        let Self {
            limit_bytes,
            current_bytes,
        } = *self;
        limit_bytes.saturating_sub(current_bytes)
    }
}

/// Source of the executor environment's real memory state.
///
/// A probe is sampled at every admission attempt, including each wasmtime
/// `memory.grow`, so `snapshot` must be cheap: the cgroup v2 backend is two
/// small file reads independent of the number of resident workers.
///
/// Only [`MemoryProbe::snapshot`] has to be implemented. Each convenience
/// accessor takes its own fresh snapshot, so two accessor calls are not
/// guaranteed to observe the same state.
pub trait MemoryProbe: Send + Sync + Debug {
    /// Samples the limit and the current usage together.
    fn snapshot(&self) -> MemorySnapshot;

    /// The limit from a fresh snapshot.
    fn limit_bytes(&self) -> u64 {
        let MemorySnapshot { limit_bytes, .. } = self.snapshot();
        limit_bytes
    }

    /// The current usage from a fresh snapshot.
    fn current_bytes(&self) -> u64 {
        let MemorySnapshot { current_bytes, .. } = self.snapshot();
        current_bytes
    }

    /// The saturating headroom of a fresh snapshot.
    fn headroom_bytes(&self) -> u64 {
        let sample = self.snapshot();
        sample.headroom_bytes()
    }
}
+#[derive(Debug)] +pub struct ProcessRssProbe { + limit_bytes: u64, +} + +impl ProcessRssProbe { + pub fn new(limit_bytes: u64) -> Self { + Self { limit_bytes } + } + + fn current_rss() -> u64 { + let mut sysinfo = sysinfo::System::new(); + let pid = sysinfo::Pid::from_u32(std::process::id()); + sysinfo.refresh_processes(sysinfo::ProcessesToUpdate::Some(&[pid]), true); + sysinfo.process(pid).map(|p| p.memory()).unwrap_or_default() + } +} + +impl MemoryProbe for ProcessRssProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit_bytes, + current_bytes: Self::current_rss(), + } + } +} + +/// A probe with a fixed limit and a fixed current usage, both set at +/// construction. Reports the same snapshot on every call regardless of the +/// host. Used by the in-process test harness, where the executor shares its +/// process (and therefore its real RSS) with the test framework and other +/// services, so a process-RSS probe cannot isolate this executor's footprint. +/// Pinning `current_bytes` to a known value (typically 0) makes the gate decide +/// purely on the granted accounting against the pinned limit, which is exact and +/// process-isolated, so memory-pressure tests are deterministic. +#[derive(Debug)] +pub struct FixedProbe { + limit_bytes: u64, + current_bytes: u64, +} + +impl FixedProbe { + pub fn new(limit_bytes: u64, current_bytes: u64) -> Self { + Self { + limit_bytes, + current_bytes, + } + } +} + +impl MemoryProbe for FixedProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit_bytes, + current_bytes: self.current_bytes, + } + } +} + +/// Linux cgroup v2 probe. Reads `memory.max` and `memory.current` from the +/// process's cgroup. +#[cfg(target_os = "linux")] +#[derive(Debug)] +pub struct CgroupV2Probe { + /// Resolved path to the cgroup directory, e.g. `/sys/fs/cgroup`. 
+ base: std::path::PathBuf, + /// Fallback limit used when `memory.max` reads `max` (unlimited) — usually + /// host RAM or the configured override. + fallback_limit_bytes: u64, +} + +#[cfg(target_os = "linux")] +impl CgroupV2Probe { + const DEFAULT_BASE: &'static str = "/sys/fs/cgroup"; + + /// Attempts to construct a cgroup v2 probe. Returns `None` when the host is + /// not running cgroup v2 (no unified `memory.current` at the base path), so + /// the caller can fall back to [`ProcessRssProbe`]. + pub fn try_new(fallback_limit_bytes: u64) -> Option { + let base = std::path::PathBuf::from(Self::DEFAULT_BASE); + // cgroup v2 unified hierarchy exposes memory.current directly at the + // delegated cgroup path. If it is not readable we are not on v2. + if std::fs::read_to_string(base.join("memory.current")).is_ok() { + Some(Self { + base, + fallback_limit_bytes, + }) + } else { + None + } + } + + fn read_u64(&self, file: &str) -> Option { + let raw = std::fs::read_to_string(self.base.join(file)).ok()?; + raw.trim().parse::().ok() + } + + fn read_limit(&self) -> u64 { + // memory.max contains either a number of bytes or the literal "max". + match std::fs::read_to_string(self.base.join("memory.max")) { + Ok(raw) => { + let trimmed = raw.trim(); + if trimmed == "max" { + self.fallback_limit_bytes + } else { + trimmed.parse::().unwrap_or(self.fallback_limit_bytes) + } + } + Err(_) => self.fallback_limit_bytes, + } + } +} + +#[cfg(target_os = "linux")] +impl MemoryProbe for CgroupV2Probe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.read_limit(), + current_bytes: self.read_u64("memory.current").unwrap_or(0), + } + } +} + +/// Constructs the best available probe. +/// +/// When `memory_override` is set, the limit is self-declared and treated as an +/// isolated budget measured against this process's RSS — the executor does not +/// assume it owns a cgroup. 
When it is `None`, the executor is assumed to own +/// its memory environment, so on Linux the exact cgroup v2 numbers are used +/// (falling back to host RAM / process RSS otherwise). +pub fn default_probe(memory_override: Option) -> Box { + if let Some(limit) = memory_override { + tracing::info!( + limit_bytes = limit, + "Memory probe: ProcessRssProbe (limit pinned by system_memory_override)" + ); + return Box::new(ProcessRssProbe::new(limit)); + } + + let host_ram = { + let mut sysinfo = sysinfo::System::new(); + sysinfo.refresh_memory(); + sysinfo.total_memory() + }; + + #[cfg(target_os = "linux")] + { + if let Some(probe) = CgroupV2Probe::try_new(host_ram) { + let snapshot = probe.snapshot(); + tracing::info!( + limit_bytes = snapshot.limit_bytes, + current_bytes = snapshot.current_bytes, + "Memory probe: CgroupV2Probe (cgroup memory.max/current)" + ); + return Box::new(probe); + } + } + tracing::info!( + limit_bytes = host_ram, + "Memory probe: ProcessRssProbe (host RAM, no cgroup v2 limit)" + ); + Box::new(ProcessRssProbe::new(host_ram)) +} diff --git a/golem-worker-executor/src/services/active_workers/mod.rs b/golem-worker-executor/src/services/active_workers/mod.rs index 3a9ece958b..24784065b4 100644 --- a/golem-worker-executor/src/services/active_workers/mod.rs +++ b/golem-worker-executor/src/services/active_workers/mod.rs @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+pub mod admission; +pub mod component_charge; pub mod concurrent_agents_scheduler; pub mod concurrent_agents_semaphore; pub mod fs_semaphore; +pub mod memory_probe; #[cfg(test)] mod tests; @@ -26,9 +29,14 @@ pub use fs_semaphore::{ filesystem_storage_permits_to_bytes, filesystem_storage_pool_bytes_to_permits, }; +pub(crate) use admission::MemoryGrant; +use admission::{AdmissionController, EvictionPriority, EvictionSource}; +use async_trait::async_trait; +pub use component_charge::HeldComponentCharge; +use component_charge::{ChargeSource, ComponentChargeGuard, ComponentChargeRegistry}; +use memory_probe::{MemoryProbe, default_probe}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore, TryAcquireError}; use tracing::{Instrument, debug}; @@ -40,7 +48,7 @@ use crate::workerctx::WorkerCtx; use golem_common::cache::{BackgroundEvictionMode, Cache, FullCacheEvictionMode, SimpleCache}; use golem_common::model::account::AccountId; use golem_common::model::agent::Principal; -use golem_common::model::component::ComponentRevision; +use golem_common::model::component::{ComponentId, ComponentRevision}; use golem_common::model::environment::EnvironmentId; use golem_common::model::invocation_context::InvocationContextStack; use golem_common::model::worker::AgentConfigEntryDto; @@ -65,71 +73,96 @@ impl RegisteredConcurrentAccount { /// Holds the metadata and wasmtime structures of currently active Golem workers pub struct ActiveWorkers { workers: Cache>, WorkerExecutorError>, - worker_memory: Arc, worker_filesystem_storage: Arc, concurrent_agents: Arc, - priority_allocation_lock: Arc>, acquire_retry_delay: Duration, + /// Authoritative measured-headroom admission gate, and the sole admission + /// authority. Decides whether real memory headroom permits a new + /// acquisition, evicting via the worker set when short. `None` when measured + /// admission is disabled (e.g. 
shared test environments), in which case + /// acquisition always proceeds. + admission: Option>, + /// Reserves each resident component's compiled module size with the gate + /// exactly once (shared across all its workers) rather than per worker, so + /// the module's resident cost is accounted before it faults into memory. + component_charges: Arc>, + /// Multiplier applied to a component's `component_size` when sizing its + /// module charge. + component_size_coefficient: f64, } -#[derive(Debug)] -pub struct WorkerMemoryPermit { - permit: Option, -} - -impl WorkerMemoryPermit { - fn new(permit: OwnedSemaphorePermit) -> Self { - crate::metrics::workers::record_memory_permit_acquired(permit.num_permits()); - Self { - permit: Some(permit), - } - } +/// Identifies a compiled component for module-charge accounting. +type ComponentChargeKey = (ComponentId, ComponentRevision); - pub fn num_permits(&self) -> usize { - self.permit - .as_ref() - .map_or(0, |permit| permit.num_permits()) - } - - pub fn merge(&mut self, mut other: Self) { - if let Some(other_permit) = other.permit.take() { - match &mut self.permit { - Some(permit) => permit.merge(other_permit), - None => self.permit = Some(other_permit), - } - } - } -} - -impl Drop for WorkerMemoryPermit { - fn drop(&mut self) { - crate::metrics::workers::record_memory_permit_released(self.num_permits()); - } -} +/// Guard held by a resident worker keeping its component's module charge alive. +pub type WorkerComponentCharge = ComponentChargeGuard; impl ActiveWorkers { pub fn new(memory_config: &MemoryConfig, storage_config: &FilesystemStorageConfig) -> Self { - let worker_memory_size = memory_config.worker_memory(); + // Build the probe once and hand it to the measured-headroom gate, which + // bases its decision on the pod's cgroup limit when constrained (not host + // RAM). 
+ let probe = default_probe(memory_config.system_memory_override); + Self::new_with_probe(probe, memory_config, storage_config) + } + + /// Like [`Self::new`] but with an explicitly provided memory probe instead of + /// the one derived from the config. The in-process test harness uses this to + /// supply a probe with a pinned limit and current usage, so the gate's + /// decision is deterministic and isolated from the shared test process's RSS. + pub fn new_with_probe( + probe: Box, + memory_config: &MemoryConfig, + storage_config: &FilesystemStorageConfig, + ) -> Self { + let admission = memory_config.enable_measured_admission.then(|| { + Arc::new(AdmissionController::new( + probe, + memory_config.admission_policy(), + )) + }); + let workers = Cache::new( + None, + FullCacheEvictionMode::None, + BackgroundEvictionMode::None, + "active_workers", + ); + let component_charges = ComponentChargeRegistry::new(GateChargeSource { + admission: admission.clone(), + }); let active_workers = Self { - workers: Cache::new( - None, - FullCacheEvictionMode::None, - BackgroundEvictionMode::None, - "active_workers", - ), - worker_memory: Arc::new(Semaphore::new(worker_memory_size)), + workers, worker_filesystem_storage: Arc::new(FilesystemStorageSemaphore::new( storage_config.worker_filesystem_storage(), storage_config.acquire_retry_delay, )), concurrent_agents: Arc::new(ConcurrentAgentsScheduler::new()), acquire_retry_delay: memory_config.acquire_retry_delay, - priority_allocation_lock: Arc::new(Mutex::new(())), + admission, + component_charges, + component_size_coefficient: memory_config.component_size_coefficient, }; - active_workers.initialize_metrics(worker_memory_size); + active_workers.initialize_metrics(); active_workers } + /// Acquire (or share) the per-component module charge for a worker of the + /// given component. 
The first resident worker of the component reserves its + /// compiled-module size (scaled by `component_size_coefficient`) with the + /// gate; subsequent workers share the same charge. The returned guard + /// releases the charge when the last worker of the component unloads. + pub async fn acquire_component_charge( + &self, + component_id: ComponentId, + component_revision: ComponentRevision, + component_module_bytes: u64, + ) -> WorkerComponentCharge { + let charge_bytes = (self.component_size_coefficient * component_module_bytes as f64) as u64; + self.component_charges + .acquire((component_id, component_revision), charge_bytes) + .await + } + pub async fn get_or_add( &self, deps: &T, @@ -202,159 +235,60 @@ impl ActiveWorkers { } } - pub async fn acquire(&self, memory: u64) -> WorkerMemoryPermit { - let mem32: u32 = memory - .try_into() - .expect("requested memory size is too large"); - + /// Blocking memory admission for a starting worker. Loops until the gate + /// admits the request, backing off between attempts, and returns a + /// [`MemoryGrant`] guard owning the reservation: the worker holds it for as + /// long as it is resident and releases it by dropping the guard, so a start + /// cancelled before the worker becomes resident cannot leak the reservation. + /// + /// A rejection is transient, not terminal. The gate reads resident memory + /// from the probe, which lags real usage (cgroup `memory.current` only counts + /// already-touched pages), so a worker admitted earlier may not yet be fully + /// resident; pressure eases as its pages settle and as other workers finish. + /// Each iteration backs off and re-reads the gate, so the caller eventually + /// proceeds once headroom recovers rather than failing under momentary + /// pressure. With measured admission disabled the worker is admitted + /// immediately with an inert grant. 
+ pub(crate) async fn acquire(&self, memory: u64) -> MemoryGrant { + let Some(admission) = &self.admission else { + return MemoryGrant::inert(); + }; loop { - let available = self.worker_memory.available_permits(); - let lock = self.priority_allocation_lock.lock().await; // Block trying until a priority request is retrying once - let result = self.worker_memory.clone().try_acquire_many_owned(mem32); - drop(lock); - match result { - Ok(permit) => { - debug!( - "Acquired {} memory of {}, new available: {}, permit size: {}", - mem32, - available, - self.worker_memory.available_permits(), - permit.num_permits() - ); - break WorkerMemoryPermit::new(permit); - } - Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"), - Err(TryAcquireError::NoPermits) => { - debug!( - "Not enough memory to allocate {mem32} (available: {}), trying to free some up", - self.worker_memory.available_permits() - ); - if self.try_free_up_memory(memory).await { - debug!("Freed up some memory, retrying"); - // We have enough memory unless another worker has taken it in the meantime, - // so retry the loop - continue; - } else { - debug!( - "Could not free up memory, retrying asking for permits after some time" - ); - // Could not free up enough memory, so waiting for permits to be available. - // We cannot use acquire_many() to wait for the permits because it eagerly preallocates - // the available permits, and by that causing deadlocks. So we sleep and retry. - - tokio::time::sleep(self.acquire_retry_delay).await; - } - } + // Evicts idle-then-warm when real headroom is short; rejects (and we + // back off) when it cannot make room rather than risking the limit. 
+ if let Some(grant) = admission.admit(memory, &self.eviction_source()).await { + return grant; } + debug!("Measured headroom insufficient for {memory}, backing off and retrying"); + tokio::time::sleep(self.acquire_retry_delay).await; } } - pub async fn try_acquire(&self, memory: u64) -> Option { - let mem32: u32 = memory - .try_into() - .expect("requested memory size is too large"); - let mut lock = None; - loop { - match self.worker_memory.clone().try_acquire_many_owned(mem32) { - Ok(permit) => { - debug!( - "Acquired {} memory of {}", - mem32, - self.worker_memory.available_permits() - ); - break Some(WorkerMemoryPermit::new(permit)); - } - Err(TryAcquireError::Closed) => panic!("worker memory semaphore has been closed"), - Err(TryAcquireError::NoPermits) => { - if lock.is_none() { - debug!( - "Not enough available memory to acquire {mem32} (available: {}), cancelling waiting acquires and retry", - self.worker_memory.available_permits() - ); - lock = Some(self.priority_allocation_lock.lock().await); - continue; - } else { - debug!( - "Not enough available memory to acquire {mem32} (available: {})", - self.worker_memory.available_permits() - ); - break None; - } - } - } + /// Builds an [`EvictionSource`] view over the live worker set for the + /// admission controller to reclaim memory through. 
+ fn eviction_source(&self) -> WorkerEvictionSource { + WorkerEvictionSource { + workers: self.workers.clone(), } } - async fn try_free_up_memory(&self, memory: u64) -> bool { - let current_avail = self.worker_memory.available_permits(); - let needed = memory.saturating_sub(current_avail as u64); - - if needed > 0 { - let mut idle_candidates = Vec::new(); - let mut warm_candidates = Vec::new(); - - debug!("Collecting memory eviction candidates"); - let pairs = self.workers.iter().await; - for (agent_id, worker) in pairs { - if let Some(class) = worker.eviction_class().await - && let Ok(mem) = worker.memory_requirement().await - { - let last_changed = worker.last_execution_state_change(); - let entry = (agent_id, worker, mem, last_changed); - match class { - crate::worker::EvictionClass::LoadedIdle => { - idle_candidates.push(entry); - } - crate::worker::EvictionClass::WarmRunnable => { - warm_candidates.push(entry); - } - } - } - } - - // Sort each bucket by timestamp — newest first so we pop oldest - idle_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis()); - idle_candidates.reverse(); - warm_candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis()); - warm_candidates.reverse(); - - let mut freed = 0u64; - - // First evict LoadedIdle workers (cheapest) - while freed < needed && !idle_candidates.is_empty() { - let (agent_id, worker, mem, _) = idle_candidates.pop().unwrap(); - debug!("Trying to stop idle {agent_id} to free up memory"); - if worker - .stop_if_evictable(crate::worker::EvictionClass::LoadedIdle) - .await - { - debug!("Stopped idle {agent_id} to free up {mem} memory"); - crate::metrics::workers::record_worker_eviction("LoadedIdle"); - freed += mem; - } - } - - // Then evict WarmRunnable workers if still under pressure - while freed < needed && !warm_candidates.is_empty() { - let (agent_id, worker, mem, _) = warm_candidates.pop().unwrap(); - debug!("Trying to stop warm-runnable {agent_id} to free up memory"); - if worker - 
.stop_if_evictable(crate::worker::EvictionClass::WarmRunnable) - .await - { - debug!("Stopped warm-runnable {agent_id} to free up {mem} memory"); - crate::metrics::workers::record_worker_eviction("WarmRunnable"); - freed += mem; - } - } - - if freed > 0 { - debug!("Freed up {freed}"); + /// Non-blocking memory admission for a growing worker. A single gate attempt: + /// returns the additional [`MemoryGrant`] when the grow is admitted, or `None` + /// when real headroom is insufficient even after eviction (the caller turns + /// `None` into a retriable out-of-memory trap). The returned grant should be + /// merged into the worker's existing grant so its whole reservation is + /// released together on unload. With measured admission disabled the grow is + /// always admitted with an inert grant. + pub(crate) async fn try_acquire(&self, memory: u64) -> Option { + let Some(admission) = &self.admission else { + return Some(MemoryGrant::inert()); + }; + match admission.admit(memory, &self.eviction_source()).await { + Some(grant) => Some(grant), + None => { + debug!("Measured headroom insufficient for {memory}, not admitting"); + None } - freed >= needed - } else { - debug!("Memory was freed up in the meantime"); - true } } @@ -471,11 +405,111 @@ impl ActiveWorkers { } /// Initializes worker gauges. Subsequent changes are recorded inline at the mutation sites. 
- fn initialize_metrics(&self, worker_memory_size: usize) { + fn initialize_metrics(&self) { crate::metrics::workers::initialize_worker_metrics(); crate::metrics::workers::set_filesystem_semaphore_available( self.worker_filesystem_storage.available_bytes(), ); - crate::metrics::storage::record_worker_memory_pool_total(worker_memory_size as u64); + } +} + +impl From for crate::worker::EvictionClass { + fn from(priority: EvictionPriority) -> Self { + match priority { + EvictionPriority::Idle => crate::worker::EvictionClass::LoadedIdle, + EvictionPriority::Warm => crate::worker::EvictionClass::WarmRunnable, + } + } +} + +/// Evicts resident workers at a single priority tier, oldest-first, stopping +/// once at least `needed_bytes` have been freed or the tier is exhausted. +/// Returns the bytes actually reclaimed. +async fn evict_at_most_memory( + workers: &Cache>, WorkerExecutorError>, + priority: EvictionPriority, + needed_bytes: u64, +) -> u64 { + let target_class: crate::worker::EvictionClass = priority.into(); + + let mut candidates = Vec::new(); + for (agent_id, worker) in workers.iter().await { + if let Some(class) = worker.eviction_class().await + && class == target_class + && let Ok(mem) = worker.memory_requirement().await + { + let last_changed = worker.last_execution_state_change(); + candidates.push((agent_id, worker, mem, last_changed)); + } + } + + // Sort by timestamp newest-first so we pop the oldest first. 
+ candidates.sort_by_key(|(_, _, _, ts)| ts.to_millis()); + candidates.reverse(); + + let mut freed = 0u64; + while freed < needed_bytes && !candidates.is_empty() { + let (agent_id, worker, mem, _) = candidates.pop().unwrap(); + debug!("Trying to stop {target_class:?} {agent_id} to free up memory"); + if worker.stop_if_evictable(target_class).await { + debug!("Stopped {target_class:?} {agent_id} to free up {mem} memory"); + crate::metrics::workers::record_worker_eviction(match priority { + EvictionPriority::Idle => "LoadedIdle", + EvictionPriority::Warm => "WarmRunnable", + }); + freed += mem; + } + } + freed +} + +/// A source of evictable, already-resident memory the gate reclaims through. +struct WorkerEvictionSource { + workers: Cache>, WorkerExecutorError>, +} + +#[async_trait] +impl EvictionSource for WorkerEvictionSource { + async fn evict_at_most(&self, priority: EvictionPriority, needed_bytes: u64) -> u64 { + evict_at_most_memory(&self.workers, priority, needed_bytes).await + } +} + +/// Production [`ChargeSource`] for the per-component module charge: reserves the +/// module's bytes with the measured-headroom gate. The module is a committed +/// consequence of admitting the first worker of a component (it loads into RAM +/// when that worker becomes resident), so it is reserved rather than admitted — +/// it neither evicts nor can be refused. `None` when measured admission is +/// disabled, in which case the charge is a no-op. +pub struct GateChargeSource { + admission: Option>, +} + +/// Held module charge: releases its reserved bytes from the gate on drop. 
+pub struct GateCharge { + admission: Option>, + bytes: u64, +} + +impl Drop for GateCharge { + fn drop(&mut self) { + if let Some(admission) = &self.admission { + admission.release(self.bytes); + } + } +} + +#[async_trait] +impl ChargeSource for GateChargeSource { + type Charge = GateCharge; + + async fn acquire_charge(&self, bytes: u64) -> GateCharge { + if let Some(admission) = &self.admission { + admission.reserve_committed(bytes); + } + GateCharge { + admission: self.admission.clone(), + bytes, + } } } diff --git a/golem-worker-executor/src/services/active_workers/tests.rs b/golem-worker-executor/src/services/active_workers/tests.rs index 82430c243b..217c0e21b6 100644 --- a/golem-worker-executor/src/services/active_workers/tests.rs +++ b/golem-worker-executor/src/services/active_workers/tests.rs @@ -729,3 +729,556 @@ async fn scheduler_accounts_are_independent() { drop(a1); drop(a2); } + +// ── Component module charge against the admission gate ─────────────────────── + +mod component_module_charge { + use super::super::admission::{AdmissionController, AdmissionPolicy}; + use super::super::component_charge::ComponentChargeRegistry; + use super::super::memory_probe::{MemoryProbe, MemorySnapshot}; + use super::super::{ComponentChargeKey, GateChargeSource, HeldComponentCharge}; + use golem_common::model::component::{ComponentId, ComponentRevision}; + use std::sync::Arc; + use test_r::test; + use uuid::Uuid; + + /// Probe reporting a fixed limit and zero resident memory, so the gate's + /// reservation is driven entirely by what is charged through it. 
+ #[derive(Debug)] + struct FixedProbe { + limit: u64, + } + + impl MemoryProbe for FixedProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit, + current_bytes: 0, + } + } + } + + fn key() -> ComponentChargeKey { + (ComponentId(Uuid::new_v4()), ComponentRevision::INITIAL) + } + + /// The first worker of a component reserves the module's bytes with the gate, + /// so admissible headroom drops by the module size before it faults into + /// memory. A second worker of the same component reserves nothing more, and + /// the reservation is released only when the last worker unloads. + #[test] + async fn module_charge_reserves_with_gate_until_last_worker_unloads() { + let limit = 1000u64; + let module_bytes = 200u64; + let controller = Arc::new(AdmissionController::new( + Box::new(FixedProbe { limit }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + let registry = ComponentChargeRegistry::new(GateChargeSource { + admission: Some(controller.clone()), + }); + let component = key(); + + assert_eq!(controller.headroom_bytes(), limit); + + let first = registry.acquire(component, module_bytes).await; + assert_eq!( + controller.headroom_bytes(), + limit - module_bytes, + "first worker of a component must reserve the module size with the gate" + ); + + let second = registry.acquire(component, module_bytes).await; + assert_eq!( + controller.headroom_bytes(), + limit - module_bytes, + "a second worker of the same component must not reserve the module again" + ); + + drop(first); + assert_eq!( + controller.headroom_bytes(), + limit - module_bytes, + "the module stays reserved while any worker of the component is resident" + ); + + drop(second); + assert_eq!( + controller.headroom_bytes(), + limit, + "the module reservation is released when the last worker unloads" + ); + } + + /// A `RunningWorker` stores its component charge as + /// `Box` and releases it by dropping that box when + /// the worker unloads. 
Dropping the box must still release the module + /// reservation with the gate, i.e. the concrete charge's release runs through + /// the trait object exactly as it would for a live worker. + #[test] + async fn dropping_boxed_charge_releases_the_reservation() { + let limit = 1000u64; + let module_bytes = 200u64; + let controller = Arc::new(AdmissionController::new( + Box::new(FixedProbe { limit }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + let registry = ComponentChargeRegistry::new(GateChargeSource { + admission: Some(controller.clone()), + }); + + let charge = registry.acquire(key(), module_bytes).await; + // Store it exactly as RunningWorker does. + let boxed: Box = Box::new(charge); + assert_eq!(controller.headroom_bytes(), limit - module_bytes); + + drop(boxed); + assert_eq!( + controller.headroom_bytes(), + limit, + "dropping the boxed charge (as on worker unload) must release the reservation" + ); + } +} + +// ── ConcurrentAgentsScheduler — model-based liveness property ──────────────── +// +// The scheduler keeps its own `running_count` integer alongside the real tokio +// semaphore permits. The two must stay in lockstep: every increment of +// `running_count` must be matched by exactly one decrement, regardless of how a +// granted slot is disposed of (released by a live worker, or dropped inside a +// cancelled waiter's oneshot channel). If they drift, the scheduler wedges — +// `running_count` sticks at the limit while permits are actually free, and +// every future acquire queues forever. This is the production deadlock the +// property is designed to catch. +// +// The model drives random interleavings of acquire / release / cancel against +// the real scheduler and, after every step, asserts the *liveness* invariant: +// whenever fewer permits are genuinely held than the limit allows, a fresh +// acquire must succeed promptly. A leaked `running_count` violates this. 
+mod scheduler_liveness { + use super::super::concurrent_agents_scheduler::{ + ConcurrentAgentPermit, ConcurrentAgentsScheduler, + }; + use super::{account, agent, resource_entry_with_agent_limit}; + use proptest::prelude::*; + use std::sync::Arc; + use std::time::Duration; + use test_r::test; + use tokio::task::JoinHandle; + + /// One step in a randomized scheduler workload. + #[derive(Debug, Clone)] + enum Op { + /// Acquire a permit and hold it (resolves immediately if capacity is + /// free, otherwise the in-flight acquire is parked in `pending`). + Acquire, + /// Release a currently-held permit, if any. + Release(prop::sample::Index), + /// Cancel an in-flight (likely queued) acquire, if any. Exercises both + /// "cancelled while queued" and "cancelled just after being granted". + CancelPending(prop::sample::Index), + /// Release a held permit and, in the same step, cancel an in-flight + /// acquire. This is the deadly race: the released slot may be granted + /// to the in-flight acquire's oneshot and then the acquire is cancelled + /// before it can receive it. The slot must still be released. + ReleaseThenCancel(prop::sample::Index, prop::sample::Index), + } + + fn arb_ops() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 3 => Just(Op::Acquire), + 2 => any::().prop_map(Op::Release), + 2 => any::().prop_map(Op::CancelPending), + 3 => (any::(), any::()) + .prop_map(|(a, b)| Op::ReleaseThenCancel(a, b)), + ], + 1..60, + ) + } + + /// Let any synchronous grant/drain bookkeeping triggered by a release or + /// cancellation settle before the next observation. + async fn settle() { + for _ in 0..8 { + tokio::task::yield_now().await; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + proptest! { + // Cap shrink iterations so a failing (buggy) run cannot spend minutes + // re-running wedging inputs against the overall timeout while shrinking. 
+ #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })] + + /// Liveness: under any interleaving of acquire / release / cancel, the + /// scheduler never wedges. After each step, if fewer permits are held + /// than the limit, a fresh acquire must succeed within a short timeout. + /// At the end, draining all held permits must let the account return to + /// full capacity. + #[test] + fn scheduler_never_wedges_under_churn( + limit in 1usize..6, + ops in arb_ops(), + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_time() + .build() + .unwrap(); + + rt.block_on(async move { + // Bound the whole case so a wedge fails fast and deterministically + // rather than hanging the test suite. A correct scheduler completes + // a 60-op workload in well under a second; the bug deadlocks here, + // so a tight bound makes the failure (and any shrinking) quick. + let outcome = tokio::time::timeout(Duration::from_secs(3), async move { + run_workload(limit, ops).await + }) + .await; + + match outcome { + Ok(result) => result, + Err(_elapsed) => Err(TestCaseError::fail( + "scheduler workload did not complete within the overall timeout — \ + deadlock (running_count leaked above true occupancy)", + )), + } + })?; + } + } + + /// Drives one randomized workload against a freshly-registered account and + /// returns `Err` if the liveness invariant is ever violated. Factored out of + /// the proptest body so the whole run can be wrapped in an overall timeout. + async fn run_workload(limit: usize, ops: Vec) -> Result<(), TestCaseError> { + // Short per-acquire timeout: a wedge must surface quickly, but allow + // enough slack for genuine multi-thread scheduling jitter. 
+ const PROBE_TIMEOUT: Duration = Duration::from_millis(500); + + let sched = Arc::new(ConcurrentAgentsScheduler::new()); + let acc = account(); + sched + .register_account(acc, resource_entry_with_agent_limit(limit as u64)) + .await; + + // Permits we are deliberately holding (count against the limit). + let mut held: Vec = Vec::new(); + // In-flight acquires not yet resolved (queued or just granted). + let mut pending: Vec> = Vec::new(); + let mut counter = 0usize; + + for op in ops { + match op { + Op::Acquire => { + counter += 1; + let sched = sched.clone(); + let name = format!("W{counter}"); + let handle = + tokio::spawn(async move { sched.acquire(acc, agent(&name)).await }); + pending.push(handle); + } + Op::Release(idx) => { + if !held.is_empty() { + let i = idx.index(held.len()); + drop(held.remove(i)); + } + } + Op::CancelPending(idx) => { + if !pending.is_empty() { + let i = idx.index(pending.len()); + pending.remove(i).abort(); + } + } + Op::ReleaseThenCancel(ri, ci) => { + if !held.is_empty() { + let i = ri.index(held.len()); + drop(held.remove(i)); + } + if !pending.is_empty() { + let i = ci.index(pending.len()); + pending.remove(i).abort(); + } + } + } + + settle().await; + + // Collect any in-flight acquires that have now resolved into + // held permits, so `held.len()` reflects true occupancy. + let mut still_pending = Vec::new(); + for h in pending.drain(..) { + if h.is_finished() { + if let Ok(permit) = h.await { + held.push(permit); + } + // Cancelled/aborted handles are simply dropped. + } else { + still_pending.push(h); + } + } + pending = still_pending; + + // Liveness invariant: if we are below the limit, a fresh + // acquire must succeed promptly. A leaked running_count + // would make this hang and trip the timeout. 
+ if held.len() < limit { + let probe = + tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("probe"))).await; + prop_assert!( + probe.is_ok(), + "scheduler wedged: held {} < limit {} but acquire timed out", + held.len(), + limit, + ); + // Release the probe immediately. + drop(probe.ok()); + settle().await; + } + } + + // Abort everything still queued, drop all held permits, and + // confirm the account drains back to full capacity: `limit` + // fresh acquires must all succeed. + for h in pending.drain(..) { + h.abort(); + let _ = h.await; + } + held.clear(); + settle().await; + + let mut drained = Vec::new(); + for _ in 0..limit { + let p = tokio::time::timeout(PROBE_TIMEOUT, sched.acquire(acc, agent("drain"))).await; + prop_assert!( + p.is_ok(), + "scheduler did not return to full capacity after churn", + ); + drained.push(p.unwrap()); + } + Ok(()) + } +} + +// ── Grant-guard liveness under random churn ────────────────────────────────── +// +// A worker's memory grant is reserved with the admission gate and then owned by +// a guard that lives in one of three places over the worker's lifetime: in the +// in-flight start task (waiting for permits), in the resident worker (started), +// or dropped (the worker exited or its start was cancelled). The liveness +// invariant — mirroring `scheduler_liveness` for the concurrent-agents scheduler +// — is that however the guard travels between those places, the gate's +// accounting stays symmetric: once every guard is gone, admissible headroom +// returns to the full ceiling. A reservation released zero times (leak, the +// cancelled-while-waiting deletion bug) or more than once (double-release) breaks +// it. With a zero-usage probe, headroom is `ceiling - granted`, so the final +// headroom reads the granted total directly. 
+mod grant_guard_liveness { + use super::super::admission::{ + AdmissionController, AdmissionPolicy, EvictionPriority, EvictionSource, MemoryGrant, + }; + use crate::services::active_workers::memory_probe::{MemoryProbe, MemorySnapshot}; + use proptest::prelude::*; + use std::sync::Arc; + use std::time::Duration; + use test_r::test; + use tokio::task::JoinHandle; + + /// Probe with a fixed limit reporting zero resident usage, so admissible + /// headroom equals `ceiling - granted` and reads the granted accounting + /// directly — the quantity a leaked or double-released grant corrupts. + #[derive(Debug)] + struct ZeroUsageProbe { + limit: u64, + } + + impl MemoryProbe for ZeroUsageProbe { + fn snapshot(&self) -> MemorySnapshot { + MemorySnapshot { + limit_bytes: self.limit, + current_bytes: 0, + } + } + } + + /// Nothing to evict: a rejected request stays rejected (the schedule keeps + /// total grants within the ceiling so admission only fails transiently, never + /// due to a leak the gate could not see). + struct NoEvictionSource; + + #[async_trait::async_trait] + impl EvictionSource for NoEvictionSource { + async fn evict_at_most(&self, _priority: EvictionPriority, _needed_bytes: u64) -> u64 { + 0 + } + } + + /// One step in a randomized grant-lifecycle workload. + #[derive(Debug, Clone)] + enum Op { + /// Begin a worker start: spawn a task that acquires a grant of this many + /// bytes and then parks holding it, as a worker waits for its remaining + /// permits before becoming resident. + Start(u64), + /// A still-in-flight start becomes resident: its task yields the grant + /// guard, which we keep (the worker is now running). + Resident(prop::sample::Index), + /// Cancel a still-in-flight start, as deleting a waiting worker does: + /// abort the task, dropping the grant guard it held. + CancelStart(prop::sample::Index), + /// A resident worker exits: drop its grant guard. 
+ Exit(prop::sample::Index), + } + + fn arb_ops() -> impl Strategy> { + prop::collection::vec( + prop_oneof![ + 4 => (1u64..50).prop_map(Op::Start), + 2 => any::().prop_map(Op::Resident), + 3 => any::().prop_map(Op::CancelStart), + 2 => any::().prop_map(Op::Exit), + ], + 1..80, + ) + } + + /// An in-flight start: the task runs admission, reports the outcome back over + /// `ready` (the grant on admit, `None` if the gate rejected it), then parks + /// holding the grant. The driver can take the grant (the worker became + /// resident) or abort the task (the start was cancelled, dropping any grant + /// inside the task). The outcome is always reported, so the driver never + /// blocks waiting on a start that was rejected. + struct InFlight { + handle: JoinHandle<()>, + ready: tokio::sync::oneshot::Receiver>, + } + + /// Drive one randomized workload and assert headroom recovers to the ceiling + /// once every grant guard is gone. + async fn run_workload(limit: u64, ops: Vec) -> Result<(), TestCaseError> { + let controller = Arc::new(AdmissionController::new( + Box::new(ZeroUsageProbe { limit }), + AdmissionPolicy { usable_ratio: 1.0 }, + )); + + let mut in_flight: Vec = Vec::new(); + let mut resident: Vec = Vec::new(); + + for op in ops { + match op { + Op::Start(bytes) => { + let controller = controller.clone(); + let (tx, rx) = tokio::sync::oneshot::channel(); + let handle = tokio::spawn(async move { + // Always report the admission outcome so the driver never + // blocks on a start that was rejected. On admit the grant + // travels to the driver (held in the channel until taken + // as resident or dropped on cancel); on reject we report + // `None`. + let outcome = controller.admit(bytes, &NoEvictionSource).await; + let _ = tx.send(outcome); + // Park so the task stays alive until the driver decides + // its fate (become resident, or be aborted on cancel). 
+ std::future::pending::<()>().await; + }); + in_flight.push(InFlight { handle, ready: rx }); + } + Op::Resident(idx) => { + if !in_flight.is_empty() { + let i = idx.index(in_flight.len()); + let started = in_flight.remove(i); + // Becoming resident requires the start to have been + // admitted. Take the grant if there is one (worker is now + // running); a rejected start cannot become resident and is + // simply discarded. Either way abort the parked task. + if let Ok(Some(grant)) = started.ready.await { + resident.push(grant); + } + started.handle.abort(); + let _ = started.handle.await; + } + } + Op::CancelStart(idx) => { + if !in_flight.is_empty() { + let i = idx.index(in_flight.len()); + let started = in_flight.remove(i); + // Delete a waiting worker: abort the task and drop the + // `InFlight`. Any grant the start acquired is held in + // `started.ready`; dropping it returns the reservation, + // exactly as aborting a waiting worker mid-flight does. + started.handle.abort(); + let _ = started.handle.await; + drop(started.ready); + } + } + Op::Exit(idx) => { + if !resident.is_empty() { + let i = idx.index(resident.len()); + drop(resident.remove(i)); + } + } + } + // Let acquires/aborts settle so the granted accounting is observable. + for _ in 0..4 { + tokio::task::yield_now().await; + } + } + + // Tear everything down: abort remaining starts, drop remaining resident + // grants. The environment is now empty. + for started in in_flight.drain(..) { + started.handle.abort(); + let _ = started.handle.await; + } + resident.clear(); + // Let the final drops' releases settle. + tokio::time::sleep(Duration::from_millis(20)).await; + + let headroom = controller.headroom_bytes(); + prop_assert_eq!( + headroom, + limit, + "headroom did not recover to ceiling {} after all grants were released (got {}); \ + a grant leaked or was double-released across the lifecycle", + limit, + headroom + ); + + // And the gate must be live again: a fresh full-ceiling admission fits. 
+ let readmit = controller.admit(limit, &NoEvictionSource).await; + prop_assert!( + readmit.is_some(), + "gate refused a full-ceiling admission after draining; headroom is wedged" + ); + Ok(()) + } + + proptest! { + #![proptest_config(ProptestConfig { cases: 128, max_shrink_iters: 64, ..ProptestConfig::default() })] + + /// Liveness: under any interleaving of start / become-resident / + /// cancel-start / exit, once every grant guard is gone the gate's + /// admissible headroom returns to the full ceiling and admits again. A + /// grant that leaks on cancellation (or is released twice) breaks this. + #[test] + fn grants_never_leak_under_random_churn( + limit in 200u64..4000, + ops in arb_ops(), + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_time() + .build() + .unwrap(); + + rt.block_on(async move { + tokio::time::timeout(Duration::from_secs(10), run_workload(limit, ops)) + .await + .unwrap_or_else(|_| Err(TestCaseError::fail( + "grant workload did not complete within the timeout", + ))) + })?; + } + } +} diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index 733d9529af..7e6ca1a298 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -73,6 +73,11 @@ pub struct GolemConfig { pub max_websocket_connections: usize, pub http_address: String, pub http_port: u16, + /// How often tokio runtime metrics are sampled from the runtime and pushed + /// into the metrics recorder exposed on `/metrics`. Prometheus scrapes the + /// rendered values independently; this is the in-process resolution. 
+ #[serde(with = "humantime_serde")] + pub runtime_metrics_sampling_interval: Duration, } impl SafeDisplay for GolemConfig { @@ -284,6 +289,7 @@ impl Default for GolemConfig { max_websocket_connections: 100, http_address: "0.0.0.0".to_string(), http_port: 8082, + runtime_metrics_sampling_interval: Duration::from_secs(5), } } } @@ -963,28 +969,31 @@ pub struct MemoryConfig { pub system_memory_override: Option, pub worker_memory_ratio: f64, pub worker_estimate_coefficient: f64, + /// Multiplier applied to a component's `component_size` when reserving its + /// compiled-module memory with the admission gate, charged once per resident + /// component (shared across all its workers) rather than per worker. + pub component_size_coefficient: f64, + /// Whether the measured-headroom admission gate is active. Requires the + /// executor to own its memory environment (its own cgroup/process), as in a + /// production pod. Disable in shared environments — such as the in-process + /// test harness — where the probe cannot isolate this executor's footprint + /// from co-resident processes. + pub enable_measured_admission: bool, #[serde(with = "humantime_serde")] pub acquire_retry_delay: Duration, pub oom_retry_config: RetryConfig, } impl MemoryConfig { - pub fn total_system_memory(&self) -> u64 { - self.system_memory_override.unwrap_or_else(|| { - let mut sysinfo = sysinfo::System::new(); - sysinfo.refresh_memory(); - sysinfo.total_memory() - }) - } - - pub fn system_memory(&self) -> u64 { - let mut sysinfo = sysinfo::System::new(); - sysinfo.refresh_memory(); - sysinfo.available_memory() - } - - pub fn worker_memory(&self) -> usize { - (self.total_system_memory() as f64 * self.worker_memory_ratio) as usize + /// The admission policy for the measured-headroom gate. Reuses + /// `worker_memory_ratio` as the usable fraction of the measured limit (the + /// host keeps the remainder). 
+ pub(crate) fn admission_policy( + &self, + ) -> crate::services::active_workers::admission::AdmissionPolicy { + crate::services::active_workers::admission::AdmissionPolicy { + usable_ratio: self.worker_memory_ratio, + } } } @@ -1004,6 +1013,16 @@ impl SafeDisplay for MemoryConfig { "worker estimate coefficient: {}", self.worker_estimate_coefficient ); + let _ = writeln!( + &mut result, + "component size coefficient: {}", + self.component_size_coefficient + ); + let _ = writeln!( + &mut result, + "measured admission enabled: {}", + self.enable_measured_admission + ); let _ = writeln!( &mut result, "acquire retry delay: {:?}", @@ -1528,6 +1547,8 @@ impl Default for MemoryConfig { system_memory_override: None, worker_memory_ratio: 0.8, worker_estimate_coefficient: 1.1, + component_size_coefficient: 2.0, + enable_measured_admission: true, acquire_retry_delay: Duration::from_millis(500), oom_retry_config: RetryConfig { max_attempts: u32::MAX, diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index 1e6d4fa7cc..23ba710f81 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -27,7 +27,8 @@ use crate::durable_host::recover_stderr_logs; use crate::metrics::storage::record_filesystem_pool_released; use crate::model::{AgentConfig, ExecutionStatus, LookupResult, ReadFileResult, TrapType}; use crate::services::active_workers::{ - FilesystemStoragePermit, RegisteredConcurrentAccount, WorkerMemoryPermit, + FilesystemStoragePermit, HeldComponentCharge, MemoryGrant, RegisteredConcurrentAccount, + WorkerComponentCharge, }; use crate::services::events::{Event, EventsSubscription}; use crate::services::golem_config::SnapshotPolicy; @@ -58,6 +59,7 @@ use golem_common::model::agent::{ AgentMode, ParsedAgentId, Principal, Snapshotting, SnapshottingConfig, }; use golem_common::model::component::CanonicalFilePath; +use golem_common::model::component::ComponentId; use 
golem_common::model::component::ComponentRevision; use golem_common::model::invocation_context::InvocationContextStack; use golem_common::model::oplog::{OplogEntry, OplogIndex, UpdateDescription}; @@ -410,6 +412,12 @@ impl Worker { WorkerInstance::Unloaded { .. } => { this.mark_as_loading(); crate::metrics::workers::inc_worker_waiting_for_memory(); + crate::metrics::wasm::record_worker_resident_linear_memory( + this.get_latest_worker_metadata() + .await + .last_known_status + .total_linear_memory_size, + ); *instance_guard = WorkerInstance::WaitingForPermit(WaitingWorker::new( this.clone(), this.memory_requirement().await?, @@ -789,15 +797,29 @@ impl Worker { self.execution_status.read().unwrap().agent_mode() } - /// Gets the estimated memory requirement of the worker + /// Gets the estimated memory requirement of the worker. + /// + /// This covers only the per-worker linear memory. The compiled component + /// module is shared by all workers of a component and is charged once per + /// resident component via the component-charge registry, not per worker. pub async fn memory_requirement(&self) -> Result { let metadata = self.get_latest_worker_metadata().await; - let ml = metadata.last_known_status.total_linear_memory_size as f64; - let sw = metadata.last_known_status.component_size as f64; - let c = 2.0; - let x = self.worker_estimate_coefficient; - Ok((x * (ml + c * sw)) as u64) + let linear_memory_bytes = metadata.last_known_status.total_linear_memory_size as f64; + let estimate_coefficient = self.worker_estimate_coefficient; + Ok((estimate_coefficient * linear_memory_bytes) as u64) + } + + /// Returns the component identity and compiled-module size used to charge + /// the shared module memory once per resident component. 
+ pub async fn component_charge_requirement( + &self, + ) -> Result<(ComponentId, ComponentRevision, u64), WorkerExecutorError> { + let metadata = self.get_latest_worker_metadata().await; + let component_id = self.owned_agent_id.component_id(); + let component_revision = metadata.last_known_status.component_revision; + let component_module_bytes = metadata.last_known_status.component_size; + Ok((component_id, component_revision, component_module_bytes)) } /// Gets the storage requirement of the worker based on the last known status. @@ -963,20 +985,39 @@ impl Worker { // Should only be called from invocation loop pub async fn increase_memory(&self, delta: u64) -> anyhow::Result<()> { + // The instance lock must not be held while running the admission gate: + // it may run the eviction scan, which takes other workers' instance + // locks. Holding this worker's instance lock across that scan while + // another growing worker does the same is an AB-BA deadlock. So check the + // state, release the lock, then run the gate. + match &*self.instance.lock().await { + WorkerInstance::Running(_) => {} + WorkerInstance::Stopping(_) + | WorkerInstance::WaitingForPermit(_) + | WorkerInstance::Unloaded { .. } + | WorkerInstance::Deleting => return Ok(()), + } + + let Some(extra_grant) = self.active_workers().try_acquire(delta).await else { + crate::metrics::workers::record_worker_memory_grow_rejected(); + return Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)); + }; + + // Re-check state under the lock: the worker may have changed state while + // the gate ran. If it is still running, merge the extra grant into the + // running worker so its whole reservation releases together on unload. + // Otherwise drop `extra_grant` here, returning the reservation to the + // gate, and treat the grow as a no-op (matching the non-running arms). 
match &mut *self.instance.lock().await { WorkerInstance::Running(running) => { - if let Some(new_permits) = self.active_workers().try_acquire(delta).await { - running.merge_extra_permits(new_permits); - Ok(()) - } else { - Err(anyhow!(GolemSpecificWasmTrap::WorkerOutOfMemory)) - } + running.merge_extra_memory_grant(extra_grant); } - WorkerInstance::Stopping(_) => Ok(()), - WorkerInstance::WaitingForPermit(_) => Ok(()), - WorkerInstance::Unloaded { .. } => Ok(()), - WorkerInstance::Deleting => Ok(()), + WorkerInstance::Stopping(_) + | WorkerInstance::WaitingForPermit(_) + | WorkerInstance::Unloaded { .. } + | WorkerInstance::Deleting => {} } + Ok(()) } /// Return `freed_bytes` to the storage semaphore pool. @@ -1627,11 +1668,15 @@ impl Worker { // when stopping via the invocation loop we can stop immediately, no need to go via the stopping status if called_from_invocation_loop { crate::metrics::workers::dec_worker_memory_resident(); + // Dropping `running` at the end of this arm releases its + // memory grant (and component/storage permits) back to the + // gate. **instance_guard = final_state.into_instance(); StopResult::Stopped } else { // drop the running worker, this signals to the invocation loop to start exiting. - // RunningWorker::drop releases the memory permit, so dec resident here. + // `stop()` consumes the RunningWorker and drops everything but + // its join handle, releasing its memory grant back to the gate. 
let run_loop_handle = running.stop(); let notify = OneShotEvent::new(); crate::metrics::workers::dec_worker_memory_resident(); @@ -2183,7 +2228,8 @@ impl Worker { async fn start_waiting_worker( this: Arc>, - permit: WorkerMemoryPermit, + memory_grant: MemoryGrant, + component_charge: WorkerComponentCharge, filesystem_storage_permit: Option, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, oom_retry_count: u32, @@ -2198,7 +2244,8 @@ impl Worker { this.owned_agent_id.clone(), this.queue.clone(), this.clone(), - permit, + memory_grant, + component_charge, concurrent_agent_permit, oom_retry_count, ) @@ -2212,6 +2259,8 @@ impl Worker { } _ => { debug!("worker was not waiting for permit anymore, not starting"); + // The worker is not becoming resident: dropping `memory_grant` + // here returns its reservation to the gate. } } } @@ -2349,10 +2398,41 @@ impl WaitingWorker { let agent_id = parent.owned_agent_id.agent_id(); let registered_concurrent_account = parent.registered_concurrent_account.clone(); let concurrent_agent_permit = registered_concurrent_account.acquire(agent_id).await; - // Do not reserve executor memory while waiting for a per-account - // concurrency slot. Otherwise one account could fill the memory - // pool with workers that are not allowed to run yet. - let permit = parent.active_workers().acquire(memory_requirement).await; + // Do not gate executor memory while waiting for a per-account + // concurrency slot. Otherwise one account could exhaust the + // memory headroom with workers that are not allowed to run yet. + // + // `memory_grant` owns the reservation from here on: it is held as + // a local until the worker becomes resident (when it moves into + // the RunningWorker) or this task ends/aborts (when dropping it + // returns the reservation to the gate). This is what makes a + // start cancelled mid-flight — e.g. 
the worker being deleted while + // still waiting for its remaining permits — release rather than + // leak its grant. + let memory_grant = parent.active_workers().acquire(memory_requirement).await; + // Reserve the component's compiled module size once per resident + // component (shared by all its workers). Held for as long as this + // worker is resident; the module faults into RAM when the first + // worker loads, so reserving it keeps later admissions honest. + let component_charge = match parent.component_charge_requirement().await { + Ok((component_id, component_revision, component_module_bytes)) => { + parent + .active_workers() + .acquire_component_charge( + component_id, + component_revision, + component_module_bytes, + ) + .await + } + Err(err) => { + warn!( + "Failed to determine component charge requirement, not starting: {err}" + ); + // Dropping `memory_grant` here returns its reservation. + return; + } + }; // Pre-acquire storage permits for this restart. // // We need to acquire `filesystem_storage_requirement + desired_extra` total: @@ -2403,7 +2483,8 @@ impl WaitingWorker { debug!("Attempting to start worker after acquiring enough permits"); Worker::start_waiting_worker( parent, - permit, + memory_grant, + component_charge, filesystem_storage_permit, concurrent_agent_permit, oom_retry_count, @@ -2435,7 +2516,18 @@ struct RunningWorker { handle: Option>, sender: UnboundedSender, queue: Arc>>, - permit: WorkerMemoryPermit, + /// The worker's memory reservation with the admission gate, covering its + /// initial requirement plus any grow deltas merged in. Held only to be + /// dropped: dropping it (on stop, eviction, or this worker being dropped for + /// any reason) returns the reservation to the gate, keeping the granted total + /// symmetric with what was reserved. + #[allow(dead_code)] + memory_grant: MemoryGrant, + /// Keeps this worker's component module charge alive while it is resident. 
+ /// Held only to be dropped: dropping it releases the component's residency + /// (and the module reservation if this was the last worker of the component). + #[allow(dead_code)] + component_charge: Box, /// Storage semaphore permits held by this worker. `None` until storage /// space is first acquired (at startup or on first write). Dropped /// automatically when `RunningWorker` is dropped, returning storage @@ -2466,7 +2558,8 @@ impl RunningWorker { owned_agent_id: OwnedAgentId, queue: Arc>>, parent: Arc>, - permit: WorkerMemoryPermit, + memory_grant: MemoryGrant, + component_charge: WorkerComponentCharge, concurrent_agent_permit: crate::services::active_workers::ConcurrentAgentPermit, oom_retry_count: u32, ) -> Self { @@ -2516,7 +2609,8 @@ impl RunningWorker { handle: Some(handle), sender, queue, - permit, + memory_grant, + component_charge: Box::new(component_charge), filesystem_storage_permit: None, waiting_for_command, interrupt_signal, @@ -2524,8 +2618,11 @@ impl RunningWorker { } } - pub fn merge_extra_permits(&mut self, extra_permit: WorkerMemoryPermit) { - self.permit.merge(extra_permit); + /// Merge an additional memory grant (from a successful grow) into this + /// worker's grant, so its whole reservation is released together when the + /// worker unloads. + pub fn merge_extra_memory_grant(&mut self, extra: MemoryGrant) { + self.memory_grant.merge(extra); } /// Merge additional storage permits into this worker's storage permit. If diff --git a/golem-worker-executor/tests/resource_limits.rs b/golem-worker-executor/tests/resource_limits.rs index 58377cba3b..a816beb39e 100644 --- a/golem-worker-executor/tests/resource_limits.rs +++ b/golem-worker-executor/tests/resource_limits.rs @@ -186,11 +186,14 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish( let context = TestContext::new(last_unique_id); let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?; - // HTTP server that gates its /poll response behind a Notify. 
+ // HTTP server that gates its /poll response behind a zero-permit semaphore. // HttpClient2.start_polling polls GET /poll until the body equals "done". - // By holding the Notify unreleased we keep a1 in the Running state - // for as long as needed, preventing eviction and holding the only permit. - let gate = std::sync::Arc::new(tokio::sync::Notify::new()); + // The handler blocks acquiring a permit, so by withholding the permit we keep + // a1 in the Running state for as long as needed, preventing eviction and + // holding the only permit. A semaphore is used rather than a Notify so the + // release is not sensitive to whether the request's waiter is registered + // before the release call. + let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0)); let gate_clone = gate.clone(); let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?; let port = listener.local_addr()?.port(); @@ -200,7 +203,10 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish( get(move || { let gate = gate_clone.clone(); async move { - gate.notified().await; + gate.acquire() + .await + .expect("gate semaphore closed") + .forget(); "done".to_string() } }), @@ -259,7 +265,7 @@ async fn concurrent_agent_limit_waits_for_running_agent_to_finish( // Release the gate — a1's poll loop returns "done", its invocation // completes, and its permit is returned to the semaphore via Drop. // This unblocks a2 from WaitingForPermit. - gate.notify_waiters(); + gate.add_permits(1); // Wait for a1 to become Idle (invocation done, permit released). executor @@ -320,7 +326,13 @@ async fn concurrent_agent_idle_releases_permit( let executor = start_with_concurrent_agent_limit(deps, &context, 1).await?; // --- HTTP gate: keeps a1 provably Running until we release it. 
--- - let gate = std::sync::Arc::new(tokio::sync::Notify::new()); + // A zero-permit semaphore is used rather than a Notify so the release is not + // sensitive to whether the request's waiter is registered before the release + // call: a permit added before the handler reaches `acquire` is simply waiting + // for it. The handler blocks on `acquire` and only returns once the test adds + // a permit, so a1 stays Running (blocked in /poll) until then regardless of + // how the runner schedules the tasks. + let gate = std::sync::Arc::new(tokio::sync::Semaphore::new(0)); let gate_clone = gate.clone(); let listener = tokio::net::TcpListener::bind("0.0.0.0:0").await?; let port = listener.local_addr()?.port(); @@ -330,7 +342,12 @@ async fn concurrent_agent_idle_releases_permit( get(move || { let gate = gate_clone.clone(); async move { - gate.notified().await; + // Consume one permit permanently so a single added permit + // releases exactly one poll, not a recycled one. + gate.acquire() + .await + .expect("gate semaphore closed") + .forget(); "done".to_string() } }), @@ -387,7 +404,7 @@ async fn concurrent_agent_idle_releases_permit( // Release the gate. a1's poll returns "done", invocation completes, a1 goes Idle. // With the fix: Idle transition drops the permit → semaphore notifies a2 → a2 starts. // With the bug: a1 stays Idle but holds permit → a2 remains blocked forever. - gate.notify_waiters(); + gate.add_permits(1); // a2 should now be unblocked (fix) or remain stuck (bug). // Give it 15 seconds — well beyond what starting a counter agent takes. diff --git a/integration-tests/benchmark_suites/cloud-density-saturation.yaml b/integration-tests/benchmark_suites/cloud-density-saturation.yaml new file mode 100644 index 0000000000..1d7a477661 --- /dev/null +++ b/integration-tests/benchmark_suites/cloud-density-saturation.yaml @@ -0,0 +1,68 @@ +# Cloud throughput-saturation benchmark suite. 
+# +# Unlike cloud-perf's throughput benchmarks (which keep `size` small enough that +# all workers fit in memory), this suite deliberately ramps the number of +# active, memory-holding agents up to and past the executor's memory ceiling to +# find the per-pod active-agent capacity and the throughput sustained once +# memory is exhausted. +# +# Each agent retains a deterministic, per-agent-distinct amount of resident +# memory, so the fleet presents a mix of footprints near the limit (exercising +# the admission/eviction path). The measured phase drives one in-flight +# `busy_for` call per agent and records aggregate throughput. +# +# Run with the benchmarks binary's `cloud` subcommand (same flags as cloud-perf): +# +# benchmarks suite integration-tests/benchmark_suites/cloud-density-saturation.yaml \ +# --save-to-json result.json \ +# cloud --api-url https://<api-host> --apps-base-domain <apps-base-domain> \ +# --admin-account-token <admin-account-token> --builtin-plugin-owner-account-id <account-id> \ +# --default-plan-id <plan-id> --component-directory <component-directory> +# +# Reading the result: plot `saturation-throughput-ops-per-sec` and +# invocation-retries/timeouts against `size`. Throughput climbs with `size` +# until the pod's memory is exhausted, then plateaus or drops while retries and +# eviction churn rise — that knee is the active-agent ceiling. +# +# `clusterSize` is ignored in cloud mode (single observed cluster). + +name: cloud-density-saturation +benchmarks: + # # Rust echo agents — lean per-instance linear memory (the ~900 KB module is + # # charged once per component, shared across all agents; what scales per agent + # # is the small instance heap). The previous run reached the top of the sweep + # # (12000) without saturating pod memory, so the knee here is throughput / + # # eviction-churn rather than memory. Dropped the low points that told us + # # nothing and pushed the range up with coarser steps. 
+ # - name: throughput-saturation-echo-rust + # iterations: 3 + # clusterSize: [2] + # size: [2000, 3000, 4000, 5000, 10000, 15000, 20000] + # length: [0] + + # # TypeScript echo agents — each instance instantiates its own QuickJS runtime + # # and JS heap in its own linear memory (the 17.4 MB module is shared once per + # # component; the per-instance runtime state is the heavy per-agent cost). + # # Heavier per agent than the Rust variant, so a lower knee — but the previous + # # run reached 2000 without saturating, so push higher and drop the low points. + # - name: throughput-saturation-echo-ts + # iterations: 3 + # clusterSize: [2] + # size: [1000, 2000, 3000] + # length: [0] + + # Synthetic footprint — each agent retains a deterministic per-agent-distinct + # amount of resident memory, exercising the admission/eviction path with a + # controllable footprint near the limit. Run first: this is the variant that + # actually fills memory and drives the gate to its reject/evict path. + # size = number of active, memory-holding agents (the ramp axis) + # length = base per-agent memory footprint in bytes; each agent retains a + # deterministic multiple (1x..8x), averaging ~4.5x. 16 MiB base => + # ~72 MiB average per agent, filling a ~10 GiB usable pool around + # ~145 agents. The sweep brackets that ceiling and pushes well past it + # so the admission gate's reject/evict behaviour near OOM is exercised. + - name: throughput-saturation-counters + iterations: 1 + clusterSize: [2] + size: [50, 100, 150, 200, 300] + length: [16777216] diff --git a/integration-tests/benchmark_suites/cloud-perf.yaml b/integration-tests/benchmark_suites/cloud-perf.yaml new file mode 100644 index 0000000000..ef8dd7d61f --- /dev/null +++ b/integration-tests/benchmark_suites/cloud-perf.yaml @@ -0,0 +1,130 @@ +# Cloud-perf benchmark suite — runs the full benchmark suite against a +# deployed Golem environment via Gateway-API hostnames (TestMode::Cloud). 
+# +# Run with the benchmarks binary's `cloud` subcommand: +# +# benchmarks suite integration-tests/benchmark_suites/cloud-perf.yaml \ +# --save-to-json result.json \ +# cloud \ +# --api-url https://<api-host> \ +# --apps-base-domain <apps-base-domain> \ +# --admin-account-token <admin-account-token> \ +# --builtin-plugin-owner-account-id <account-id> \ +# --default-plan-id <plan-id> \ +# --component-directory <component-directory> +# +# Note: clusterSize is ignored in Cloud mode (the observed cluster size is +# read from shard-manager at run start and recorded in result metadata). +# +# Suite order rationale: throughput benchmarks run first because they involve +# RPC worker pairs and HTTP deployments — the most complex setup. Running them +# early surfaces infrastructure issues (stuck workers, port-forward drops) +# before spending time on the simpler benchmarks. + +name: cloud-perf +benchmarks: + # Throughput — measures invocation throughput across six implementations: + # rust agent (gRPC), TS agent (gRPC), rust agent (HTTP), TS agent (HTTP), + # TS RPC pair, rust RPC pair. + # size = number of workers per implementation (×6 implementations total) + # length = unused for echo + - name: throughput-echo + iterations: 3 + clusterSize: [2] + size: [1, 50, 100, 250] + length: [1000] + + # size = number of workers per implementation + # length = payload size in bytes sent to large_input + # NOTE: large payloads grow worker linear memory, so this is the throughput + # benchmark most relevant to the memory-admission investigation — sized to + # match throughput-echo so it exercises real density. + - name: throughput-large-input + iterations: 3 + clusterSize: [2] + size: [1, 50, 100, 250] + length: [100, 10000] + + # size = number of workers per implementation + # length = CPU work length passed to cpu_intensive + - name: throughput-cpu-intensive + iterations: 3 + clusterSize: [2] + size: [1, 50, 100, 250] + length: [100] + + # Cold-start: compilation cache disabled — measures true cold-start latency + # with no warm compiled artefact available. 
+ # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [2] + disableCompilationCache: true + + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [5] + disableCompilationCache: true + + # Cold-start: compilation cache enabled — measures latency once the compiled + # artefact is available in the cache. + # size = number of unique components created (each in its own env) + # length = seconds to wait per component for pre-compilation warm-up + # NOTE: if results here are close to the cache-disabled entries above, the + # warm-up wait is too short and compilation hasn't finished — bump length. + - name: cold-start-unknown-small + iterations: 3 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [2] + + - name: cold-start-unknown-medium + iterations: 3 + clusterSize: [2] + size: [1, 5, 25, 50] + length: [5] + + # Invocation latency — hot and cold paths through the Gateway NLB. + # Large worker counts to stress the load balancer and connection pool. + # size = number of workers created + # length = number of hot invocations per worker after the first cold one + - name: latency-small + iterations: 3 + clusterSize: [2] + size: [100, 500, 1000, 2000, 5000] + length: [2] + + - name: latency-medium + iterations: 3 + clusterSize: [2] + size: [100, 500, 1000, 2000] + length: [5] + + # Sleep — measures worker suspension and resumption under real network + # conditions. High residency: all `size` workers held in memory sleeping at + # once, so this also probes how many resident workers fit (memory-admission + # relevant) — pushed past the ~2000 echo proved out. 
+ # size = number of workers launched in parallel + # length = sleep duration in milliseconds + - name: sleep + iterations: 3 + clusterSize: [2] + size: [10, 100, 500, 1000, 2000] + length: [10000] + + # Durability overhead — measures the cost of durable vs ephemeral execution + # across four variants (durable-persistent, durable-non-persistent, + # ephemeral, durable-persistent-commit). size workers concurrent per phase; + # sized up to put real load on the oplog/persistence/storage path. + # size = number of workers per variant + # length = loop iteration count passed to oplog_heavy + - name: durability-overhead + iterations: 3 + clusterSize: [2] + size: [10, 50, 100, 200] + length: [5000] diff --git a/integration-tests/src/benchmarks/all.rs b/integration-tests/src/benchmarks/all.rs index 91d972534d..e79ac78612 100644 --- a/integration-tests/src/benchmarks/all.rs +++ b/integration-tests/src/benchmarks/all.rs @@ -13,16 +13,28 @@ // limitations under the License. use clap::Parser; +use golem_client::api::RegistryServiceClient; +use golem_common::base_model::agent::ParsedAgentId; +use golem_common::model::AgentId; +use golem_common::model::application::{ApplicationCreation, ApplicationName}; +use golem_common::model::environment::{EnvironmentCreation, EnvironmentName}; +use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{ Benchmark, BenchmarkApi, BenchmarkConfig, BenchmarkResult, BenchmarkSuite, BenchmarkSuiteItem, - BenchmarkSuiteResult, + BenchmarkSuiteResult, RunMetadata, +}; +use golem_test_framework::config::benchmark::{TestMode, cloud_bench_run_id}; +use golem_test_framework::config::{ + BenchmarkCliParameters, BenchmarkTestDependencies, TestDependencies, +}; +use golem_test_framework::dsl::{TestDsl, TestDslExtended}; +use integration_tests::benchmarks::{ + cleanup_account, cleanup_user_state, delete_workers, invoke_and_await_agent, }; -use golem_test_framework::config::benchmark::TestMode; -use 
golem_test_framework::config::{BenchmarkCliParameters, BenchmarkTestDependencies}; use std::collections::BTreeMap; use std::future::Future; use std::pin::Pin; -use tracing::{Level, debug, info}; +use tracing::{Level, debug, info, warn}; type RunFn = Box< dyn for<'a> Fn( @@ -121,6 +133,30 @@ async fn main() { >(mode, verbosity, item, primary_only, otlp)) }), ); + benchmarks_by_name.insert( + "throughput-saturation-counters", + Box::new(|mode, verbosity, item, primary_only, otlp| { + Box::pin(run_benchmark::< + integration_tests::benchmarks::throughput_saturation::ThroughputSaturationCounters, + >(mode, verbosity, item, primary_only, otlp)) + }), + ); + benchmarks_by_name.insert( + "throughput-saturation-echo-rust", + Box::new(|mode, verbosity, item, primary_only, otlp| { + Box::pin(run_benchmark::< + integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoRust, + >(mode, verbosity, item, primary_only, otlp)) + }), + ); + benchmarks_by_name.insert( + "throughput-saturation-echo-ts", + Box::new(|mode, verbosity, item, primary_only, otlp| { + Box::pin(run_benchmark::< + integration_tests::benchmarks::throughput_saturation::ThroughputSaturationEchoTs, + >(mode, verbosity, item, primary_only, otlp)) + }), + ); let params = BenchmarkCliParameters::parse_from(std::env::args_os()); let tracer_provider = BenchmarkTestDependencies::init_logging(&params); @@ -144,7 +180,14 @@ async fn main() { length: length.clone(), disable_compilation_cache: Some(*disable_compilation_cache), }; - let result = f( + + cloud_preflight_warmup( + params.benchmark_config.mode(), + params.service_verbosity(), + params.otlp, + ) + .await; + let mut result = f( params.benchmark_config.mode(), params.service_verbosity(), &item, @@ -152,6 +195,10 @@ async fn main() { params.otlp, ) .await; + // Attach the run_id to result metadata (cloud mode only).
+ if let Some(run_id) = cloud_bench_run_id() { + result.run_id = Some(format!("bench-{run_id}")); + } if params.json { let str = serde_json::to_string(&result) .expect("Failed to serialize BenchmarkResult"); @@ -174,9 +221,27 @@ async fn main() { let suite: BenchmarkSuite = serde_yaml::from_str(&raw_suite).expect("Failed to parse benchmark suite"); + // Validate every benchmark name up-front so a typo exits immediately + // without running warmup or any prior benchmark. + for benchmark in &suite.benchmarks { + if !benchmarks_by_name.contains_key(benchmark.name.as_str()) { + print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name); + // print_non_existing_benchmark calls std::process::exit(1) + unreachable!(); + } + } + + // Pre-flight warmup runs after all names are validated. + cloud_preflight_warmup( + params.benchmark_config.mode(), + params.service_verbosity(), + params.otlp, + ) + .await; + let mut suite_result = BenchmarkSuiteResult::new(&suite.name); for benchmark in suite.benchmarks { - info!("Running {benchmark:?}"); // TODO + info!("Running {benchmark:?}"); if let Some(f) = benchmarks_by_name.get(benchmark.name.as_str()) { let result = f( @@ -188,8 +253,19 @@ async fn main() { ) .await; suite_result.add(result); - } else { - print_non_existing_benchmark(&mut benchmarks_by_name, &benchmark.name); + } + // no else: we already validated all names above + } + + // Attach the run_id and run_metadata to result metadata (cloud mode only). + if let Some(run_id) = cloud_bench_run_id() { + suite_result.run_id = Some(format!("bench-{run_id}")); + + // Read GOLEM_BENCH_* env vars set by the buildspec before invoking + // the binary. Missing vars produce None rather than failing the run. 
+ let metadata = RunMetadata::from_env(); + if !metadata.is_empty() { + suite_result.run_metadata = Some(metadata); } } @@ -241,3 +317,164 @@ async fn run_benchmark( ) -> BenchmarkResult { B::run_benchmark(mode, verbosity, item, primary_only, otlp).await } + +// ── Pre-flight warmup constants ─────────────────────────────────────────────── + +/// WASM file name (without `.wasm`) of the component used for warmup +/// invocations. Must be present in `--component-directory`. +const WARMUP_COMPONENT_WASM: &str = "benchmark_agent_rust_release"; +/// Registry display name for the warmup component. +const WARMUP_COMPONENT_NAME: &str = "benchmark:agent-rust"; +/// Agent type whose `echo` method is invoked during warmup. +const WARMUP_AGENT_TYPE: &str = "RustBenchmarkAgent"; +/// Instance ID of the throwaway warmup agent. +const WARMUP_AGENT_INSTANCE: &str = "warmup"; +/// Total wall-clock budget for the 50 warmup invocations. If the budget +/// fires (e.g. the platform is slow to cold-start on the first invocation) +/// a warning is logged and the benchmark continues — warmup is best-effort. +const WARMUP_BUDGET: std::time::Duration = std::time::Duration::from_secs(180); + +/// Pre-flight warmup for cloud mode. Runs once at suite/benchmark start; +/// is a no-op for all non-cloud modes. +/// +/// Executes 50 throwaway `invoke_and_await_agent` calls against a short-lived +/// user/env/component. Each call exercises the full stack: +/// gateway → registry-service (component lookup) → worker-service +/// → worker-executor, warming NLB target-group routing and HTTP/2 sessions at +/// every hop so they don't contaminate the first measured iteration. +/// +/// The entire invocation phase is bounded by a 3-minute timeout. If the +/// timeout fires (e.g. because of a gateway routing issue on the first cold +/// start), a warning is logged and the benchmark continues — warm-up is +/// best-effort. +/// +/// If uploading the warmup component fails (e.g. 
the file is absent from the +/// component directory), a warning is logged and the agent-invocation phase +/// is skipped; the throwaway account is still cleaned up. +async fn cloud_preflight_warmup(mode: &TestMode, verbosity: Level, otlp: bool) { + if !matches!(mode, TestMode::Cloud { .. }) { + return; + } + + info!("Pre-flight warmup: creating throwaway user/env/component (50 invocations)..."); + + let deps = BenchmarkTestDependencies::new(mode, verbosity, 0, false, otlp).await; + + let user = match deps.user().await { + Ok(u) => u, + Err(e) => { + warn!("Pre-flight warmup: failed to create user (skipping): {e:?}"); + deps.kill_all().await; + return; + } + }; + + let registry_client = user.registry_service_client().await; + let prefix = user.deps.bench_name_prefix().unwrap_or_default(); + + let app = match registry_client + .create_application( + &user.account_id.0, + &ApplicationCreation { + name: ApplicationName(format!("{prefix}app-warmup")), + }, + ) + .await + { + Ok(a) => a, + Err(e) => { + warn!("Pre-flight warmup: failed to create app (skipping): {e:?}"); + cleanup_account(&user).await; + deps.kill_all().await; + return; + } + }; + + let env = match registry_client + .create_environment( + &app.id.0, + &EnvironmentCreation { + name: EnvironmentName(format!("{prefix}env-warmup")), + compatibility_check: false, + version_check: false, + security_overrides: false, + }, + ) + .await + { + Ok(e) => e, + Err(e) => { + warn!("Pre-flight warmup: failed to create env (skipping): {e:?}"); + // delete app explicitly before account (cascading delete is incomplete) + if let Err(del_err) = registry_client + .delete_application(&app.id.0, app.revision.into()) + .await + { + warn!( + "Pre-flight warmup: failed to delete app {} after env-creation \ + failure (best-effort, app may be orphaned): {del_err:?}", + app.id.0 + ); + } + cleanup_account(&user).await; + deps.kill_all().await; + return; + } + }; + + let component = match user + .component(&env.id, 
WARMUP_COMPONENT_WASM) + .name(WARMUP_COMPONENT_NAME) + .store() + .await + { + Ok(c) => c, + Err(e) => { + warn!( + "Pre-flight warmup: failed to upload warmup component \ + ({WARMUP_COMPONENT_WASM}.wasm) — ensure it exists in the \ + component directory: {e:?}" + ); + cleanup_user_state(&user, &env.id).await; + deps.kill_all().await; + return; + } + }; + + let warmup_agent: ParsedAgentId = agent_id!(WARMUP_AGENT_TYPE, WARMUP_AGENT_INSTANCE); + + // Bound the 50 invocations with a total wall-clock budget. + let invoke_result = tokio::time::timeout(WARMUP_BUDGET, async { + for i in 0..50usize { + let result = invoke_and_await_agent( + &user, + &component, + &warmup_agent, + "echo", + data_value!("warmup"), + ) + .await; + info!( + "Pre-flight warmup invocation {}/50: {}ms", + i + 1, + result.accumulated_time.as_millis() + ); + } + }) + .await; + + if invoke_result.is_err() { + warn!( + "Pre-flight warmup: invocation phase timed out after {}s (continuing anyway)", + WARMUP_BUDGET.as_secs() + ); + } + + if let Ok(worker_id) = AgentId::from_agent_id(component.id, &warmup_agent) { + delete_workers(&user, &[worker_id]).await; + } + cleanup_user_state(&user, &env.id).await; + deps.kill_all().await; + + info!("Cloud pre-flight warmup complete."); +} diff --git a/integration-tests/src/benchmarks/cleanup.rs b/integration-tests/src/benchmarks/cleanup.rs new file mode 100644 index 0000000000..2047b06c4d --- /dev/null +++ b/integration-tests/src/benchmarks/cleanup.rs @@ -0,0 +1,529 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! Cleanup helpers for cloud-perf benchmarks. +//! +//! The [`CleanupClient`] trait is the narrow interface used by the cascading +//! cleanup logic, which enables unit-testing with the [`MockCleanupClient`] +//! below. + +use async_trait::async_trait; +use golem_client::api::RegistryServiceClient; +use golem_common::model::environment::EnvironmentId; +use golem_test_framework::config::dsl_impl::TestUserContext; +use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies}; +use tracing::warn; +use uuid::Uuid; + +// ── Narrow trait ───────────────────────────────────────────────────────────── + +/// Narrow client interface covering only the operations used by the cascading +/// cleanup helpers. Use [`RegistryCleanupAdapter`] to wrap a real client and +/// [`MockCleanupClient`] (in tests) to inject failures. +#[async_trait] +pub trait CleanupClient: Send + Sync { + /// Returns `(component_id, revision)` pairs for all components in the env. + async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>>; + async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()>; + + /// Returns domain-registration IDs for the env. + async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result<Vec<Uuid>>; + async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()>; + + /// Returns `(application_id, env_revision)` for the environment. + async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)>; + async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()>; + + /// Returns the application's current revision. + async fn get_application_revision(&self, app_id: &Uuid) -> anyhow::Result<u64>; + async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()>; + + /// Returns the account's current revision.
+ async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result<u64>; + async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()>; +} + +// ── Real adapter ───────────────────────────────────────────────────────────── + +/// Wraps any `RegistryServiceClient` implementor and bridges it to +/// [`CleanupClient`]. +pub struct RegistryCleanupAdapter<C> { + inner: C, +} + +impl<C> RegistryCleanupAdapter<C> { + pub fn new(inner: C) -> Self { + Self { inner } + } +} + +#[async_trait] +impl<C: RegistryServiceClient + Send + Sync> CleanupClient for RegistryCleanupAdapter<C> { + async fn list_env_components(&self, env_id: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>> { + let page = self + .inner + .list_environment_components(env_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(page + .values + .into_iter() + .map(|c| (c.id.0, c.revision.into())) + .collect()) + } + + async fn delete_component(&self, id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_component(id, revision) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn list_env_domain_registrations(&self, env_id: &Uuid) -> anyhow::Result<Vec<Uuid>> { + let page = self + .inner + .list_environment_domain_registrations(env_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(page.values.into_iter().map(|dr| dr.id.0).collect()) + } + + async fn delete_domain_registration(&self, id: &Uuid) -> anyhow::Result<()> { + self.inner + .delete_domain_registration(id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn get_env_app_id_and_revision(&self, env_id: &Uuid) -> anyhow::Result<(Uuid, u64)> { + let env = self + .inner + .get_environment(env_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok((env.application_id.0, env.revision.into())) + } + + async fn delete_environment(&self, env_id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_environment(env_id, revision) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn get_application_revision(&self, app_id:
&Uuid) -> anyhow::Result<u64> { + let app = self + .inner + .get_application(app_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(app.revision.into()) + } + + async fn delete_application(&self, app_id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_application(app_id, revision) + .await + .map_err(|e| anyhow::anyhow!("{e:?}")) + } + + async fn get_account_revision(&self, account_id: &Uuid) -> anyhow::Result<u64> { + let account = self + .inner + .get_account(account_id) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; + Ok(account.revision.into()) + } + + async fn delete_account(&self, account_id: &Uuid, revision: u64) -> anyhow::Result<()> { + self.inner + .delete_account(account_id, revision) + .await + .map(|_| ()) + .map_err(|e| anyhow::anyhow!("{e:?}")) + } +} + +// ── Core cleanup logic (testable via CleanupClient) ─────────────────────────── + +/// Steps 1–4 of the cascading cleanup: components → domain registrations → +/// environment → application. Does **not** delete the account. +/// +/// Every step is best-effort: failures are warned and cleanup continues. +/// +/// **Note:** Server-side cascading delete is incomplete (golemcloud/golem#3291).
+pub async fn cleanup_env_and_app_with(client: &dyn CleanupClient, env_id: &Uuid) { + // Step 1: components + match client.list_env_components(env_id).await { + Ok(components) => { + for (cid, rev) in components { + if let Err(e) = client.delete_component(&cid, rev).await { + warn!("cleanup: delete component {cid} failed (best-effort): {e:?}"); + } + } + } + Err(e) => warn!("cleanup: list components for env {env_id} failed (best-effort): {e:?}"), + } + + // Step 2: domain registrations + match client.list_env_domain_registrations(env_id).await { + Ok(ids) => { + for id in ids { + if let Err(e) = client.delete_domain_registration(&id).await { + warn!("cleanup: delete domain registration {id} failed (best-effort): {e:?}"); + } + } + } + Err(e) => { + warn!( + "cleanup: list domain registrations for env {env_id} failed \ + (best-effort): {e:?}" + ) + } + } + + // Step 3: environment (also captures app_id for step 4) + let app_id = match client.get_env_app_id_and_revision(env_id).await { + Ok((app_id, rev)) => { + if let Err(e) = client.delete_environment(env_id, rev).await { + warn!("cleanup: delete environment {env_id} failed (best-effort): {e:?}"); + } + Some(app_id) + } + Err(e) => { + warn!("cleanup: get environment {env_id} failed (best-effort): {e:?}"); + None + } + }; + + // Step 4: application (only when app_id is known from step 3) + if let Some(app_id) = app_id { + match client.get_application_revision(&app_id).await { + Ok(rev) => { + if let Err(e) = client.delete_application(&app_id, rev).await { + warn!("cleanup: delete application {app_id} failed (best-effort): {e:?}"); + } + } + Err(e) => { + warn!("cleanup: get application {app_id} failed (best-effort): {e:?}") + } + } + } +} + +/// Step 5 of the cascading cleanup: deletes the user account. 
+pub async fn cleanup_account_with(client: &dyn CleanupClient, account_id: &Uuid) { + match client.get_account_revision(account_id).await { + Ok(rev) => { + if let Err(e) = client.delete_account(account_id, rev).await { + warn!("cleanup: delete account {account_id} failed (best-effort): {e:?}"); + } + } + Err(e) => { + warn!("cleanup: get account {account_id} failed (best-effort): {e:?}") + } + } +} + +// ── High-level wrappers (take a TestUserContext) ────────────────────────────── + +/// Steps 1–4: components, domain registrations, environment, application. +/// +/// For benchmarks whose iterations create one user with multiple envs/apps +/// (e.g. cold-start-unknown), call this once per env then call +/// [`cleanup_account`] once at the end. +pub async fn cleanup_env_and_app( + user: &TestUserContext<BenchmarkTestDependencies>, + env_id: &EnvironmentId, +) { + let client = user.deps.registry_service().client(&user.token).await; + let adapter = RegistryCleanupAdapter::new(client); + cleanup_env_and_app_with(&adapter, &env_id.0).await; +} + +/// Step 5: deletes the user account. +pub async fn cleanup_account(user: &TestUserContext<BenchmarkTestDependencies>) { + let client = user.deps.registry_service().client(&user.token).await; + let adapter = RegistryCleanupAdapter::new(client); + cleanup_account_with(&adapter, &user.account_id.0).await; +} + +/// Convenience wrapper for the common single-env-per-user case: +/// [`cleanup_env_and_app`] followed by [`cleanup_account`].
+pub async fn cleanup_user_state( + user: &TestUserContext<BenchmarkTestDependencies>, + env_id: &EnvironmentId, +) { + cleanup_env_and_app(user, env_id).await; + cleanup_account(user).await; +} + +// ── Unit tests ──────────────────────────────────────────────────────────────── + +#[cfg(test)] +pub mod tests { + use super::*; + use std::collections::HashSet; + use std::sync::{Arc, Mutex}; + use test_r::test; + + fn block_on<F: std::future::Future>(f: F) -> F::Output { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(f) + } + + /// In-process mock that records every operation attempted and fails the + /// operations listed in `fail_ops`. + pub struct MockCleanupClient { + fail_ops: HashSet<&'static str>, + /// Ordered log of every operation attempted. + pub calls: Arc<Mutex<Vec<&'static str>>>, + /// The `application_id` returned by `get_env_app_id_and_revision` + /// (used to verify step-4 precondition propagation in tests). + pub app_id: Uuid, + } + + impl MockCleanupClient { + pub fn new(fail_ops: &[&'static str]) -> (Self, Arc<Mutex<Vec<&'static str>>>) { + let calls = Arc::new(Mutex::new(Vec::new())); + let mock = Self { + fail_ops: fail_ops.iter().copied().collect(), + calls: calls.clone(), + app_id: Uuid::new_v4(), + }; + (mock, calls) + } + + fn record(&self, name: &'static str) { + self.calls.lock().unwrap().push(name); + } + + fn result(&self, name: &'static str) -> anyhow::Result<()> { + self.record(name); + if self.fail_ops.contains(name) { + Err(anyhow::anyhow!("simulated failure in {name}")) + } else { + Ok(()) + } + } + } + + #[async_trait] + impl CleanupClient for MockCleanupClient { + async fn list_env_components(&self, _: &Uuid) -> anyhow::Result<Vec<(Uuid, u64)>> { + self.record("list_env_components"); + if self.fail_ops.contains("list_env_components") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(vec![(Uuid::new_v4(), 0)]) + } + } + + async fn delete_component(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_component") + } + + async fn list_env_domain_registrations(&self,
_: &Uuid) -> anyhow::Result<Vec<Uuid>> { + self.record("list_env_domain_registrations"); + if self.fail_ops.contains("list_env_domain_registrations") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(vec![Uuid::new_v4()]) + } + } + + async fn delete_domain_registration(&self, _: &Uuid) -> anyhow::Result<()> { + self.result("delete_domain_registration") + } + + async fn get_env_app_id_and_revision(&self, _: &Uuid) -> anyhow::Result<(Uuid, u64)> { + self.record("get_env_app_id_and_revision"); + if self.fail_ops.contains("get_env_app_id_and_revision") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok((self.app_id, 1)) + } + } + + async fn delete_environment(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_environment") + } + + async fn get_application_revision(&self, _: &Uuid) -> anyhow::Result<u64> { + self.record("get_application_revision"); + if self.fail_ops.contains("get_application_revision") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(1) + } + } + + async fn delete_application(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_application") + } + + async fn get_account_revision(&self, _: &Uuid) -> anyhow::Result<u64> { + self.record("get_account_revision"); + if self.fail_ops.contains("get_account_revision") { + Err(anyhow::anyhow!("simulated failure")) + } else { + Ok(1) + } + } + + async fn delete_account(&self, _: &Uuid, _: u64) -> anyhow::Result<()> { + self.result("delete_account") + } + } + + // ── Test helpers ────────────────────────────────────────────────────────── + + fn all_ops() -> Vec<&'static str> { + vec![ + "list_env_components", + "delete_component", + "list_env_domain_registrations", + "delete_domain_registration", + "get_env_app_id_and_revision", + "delete_environment", + "get_application_revision", + "delete_application", + "get_account_revision", + "delete_account", + ] + } + + fn run(mock: &MockCleanupClient) { + let env_id = Uuid::new_v4(); + let account_id = Uuid::new_v4();
+ block_on(async { + cleanup_env_and_app_with(mock, &env_id).await; + cleanup_account_with(mock, &account_id).await; + }); + } + + fn contains(calls: &[&str], op: &str) -> bool { + calls.contains(&op) + } + + // ── Tests ───────────────────────────────────────────────────────────────── + + #[test] + fn all_steps_run_on_success() { + let (mock, calls) = MockCleanupClient::new(&[]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + for op in all_ops() { + assert!( + contains(&calls, op), + "expected '{op}' to be called; got: {calls:?}" + ); + } + } + + #[test] + fn step1_list_failure_continues() { + let (mock, calls) = MockCleanupClient::new(&["list_env_components"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!( + contains(&calls, "list_env_domain_registrations"), + "{calls:?}" + ); + assert!(contains(&calls, "get_env_app_id_and_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } + + #[test] + fn step2_list_failure_continues() { + let (mock, calls) = MockCleanupClient::new(&["list_env_domain_registrations"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "get_env_app_id_and_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } + + /// `get_env_app_id_and_revision` (step 3 get) fails → step 4 is skipped + /// (no app_id available) but step 5 still runs. + #[test] + fn step3_get_failure_skips_step4_runs_step5() { + let (mock, calls) = MockCleanupClient::new(&["get_env_app_id_and_revision"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!( + !contains(&calls, "get_application_revision"), + "step 4 must be skipped when step 3 get fails; got: {calls:?}" + ); + assert!( + contains(&calls, "get_account_revision"), + "step 5 must still run; got: {calls:?}" + ); + } + + /// `delete_environment` fails but get succeeded, so app_id is available: + /// step 4 and step 5 both run. 
+ #[test] + fn step3_delete_failure_still_runs_step4_and_step5() { + let (mock, calls) = MockCleanupClient::new(&["delete_environment"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "get_application_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } + + #[test] + fn step4_failure_continues_to_step5() { + let (mock, calls) = MockCleanupClient::new(&["get_application_revision"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!( + contains(&calls, "get_account_revision"), + "step 5 should run after step 4 failure; got: {calls:?}" + ); + } + + /// `get_account_revision` (step 5 get) fails → function completes without + /// panic and `delete_account` is not attempted. + #[test] + fn step5_get_failure_no_delete_and_completes() { + let (mock, calls) = MockCleanupClient::new(&["get_account_revision"]); + run(&mock); + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + assert!( + !contains(&calls, "delete_account"), + "delete_account must not run when get fails; got: {calls:?}" + ); + } + + /// All steps fail simultaneously — function completes without panic and + /// every unconditional step is attempted. 
+ #[test] + fn all_steps_fail_no_short_circuit() { + let (mock, calls) = MockCleanupClient::new(&all_ops()); + run(&mock); // must not panic + let calls = calls.lock().unwrap().clone(); + assert!(contains(&calls, "list_env_components"), "{calls:?}"); + assert!( + contains(&calls, "list_env_domain_registrations"), + "{calls:?}" + ); + assert!(contains(&calls, "get_env_app_id_and_revision"), "{calls:?}"); + assert!(contains(&calls, "get_account_revision"), "{calls:?}"); + } +} diff --git a/integration-tests/src/benchmarks/cold_start_unknown.rs b/integration-tests/src/benchmarks/cold_start_unknown.rs index f29f297658..592b80e2e4 100644 --- a/integration-tests/src/benchmarks/cold_start_unknown.rs +++ b/integration-tests/src/benchmarks/cold_start_unknown.rs @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::benchmarks::{delete_workers, invoke_and_await_agent}; +use crate::benchmarks::{ + cleanup_account, cleanup_env_and_app, delete_workers, invoke_and_await_agent, +}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -196,6 +199,8 @@ impl Benchmark for ColdStartUnknownMedium { pub struct IterationContext { user: TestUserContext, agents: Vec<(ComponentDto, ParsedAgentId)>, + /// One env_id per size (cold_start creates one env per component). 
+ env_ids: Vec<EnvironmentId>, } pub struct ColdStartUnknownBenchmark { @@ -235,11 +240,13 @@ impl ColdStartUnknownBenchmark { pub async fn setup_iteration(&self, config: &RunConfig) -> IterationContext { let user = self.deps.user().await.unwrap(); let mut agents = vec![]; + let mut env_ids = vec![]; for _ in 0..config.size { // Agent types names are unique within one environment, // so make sure each component get its own env let (_, env) = user.app_and_env().await.unwrap(); + env_ids.push(env.id); let component = user .component(&env.id, &self.component_name) @@ -252,7 +259,11 @@ impl ColdStartUnknownBenchmark { agents.push((component, agent_id)); } - IterationContext { user, agents } + IterationContext { + user, + agents, + env_ids, + } } pub async fn warmup(&self, config: &RunConfig) { @@ -298,6 +309,14 @@ impl ColdStartUnknownBenchmark { .iter() .filter_map(|(component, agent_id)| AgentId::from_agent_id(component.id, agent_id).ok()) .collect(); - delete_workers(&iteration.user, &agent_ids).await + delete_workers(&iteration.user, &agent_ids).await; + // Clean up each env/app individually, then delete the account once. + // This avoids the account being deleted on the first env cleanup and + // causing subsequent cleanup calls to fail (since the user token would + // be invalid after account deletion). + for env_id in &iteration.env_ids { + cleanup_env_and_app(&iteration.user, env_id).await; + } + cleanup_account(&iteration.user).await; } } diff --git a/integration-tests/src/benchmarks/durability_overhead.rs b/integration-tests/src/benchmarks/durability_overhead.rs index f956eb3636..fb864fd44c 100644 --- a/integration-tests/src/benchmarks/durability_overhead.rs +++ b/integration-tests/src/benchmarks/durability_overhead.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License.
-use crate::benchmarks::{delete_workers, invoke_and_await_agent}; +use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::{ComponentDto, ComponentId}; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -42,6 +43,7 @@ pub struct DurabilityOverheadIterationContext { durable_nonpersistent_agent_ids: Vec, ephemeral_agent_ids: Vec, durable_persistent_commit_agent_ids: Vec, + env_id: EnvironmentId, } fn agent_ids_to_agent_ids(component_id: ComponentId, agent_ids: &[ParsedAgentId]) -> Vec { @@ -146,6 +148,7 @@ impl Benchmark for DurabilityOverhead { durable_nonpersistent_agent_ids, ephemeral_agent_ids, durable_persistent_commit_agent_ids, + env_id: env.id, } } @@ -336,5 +339,6 @@ impl Benchmark for DurabilityOverhead { ), ) .await; + cleanup_user_state(&context.user, &context.env_id).await; } } diff --git a/integration-tests/src/benchmarks/latency.rs b/integration-tests/src/benchmarks/latency.rs index a44ff42333..006d29f228 100644 --- a/integration-tests/src/benchmarks/latency.rs +++ b/integration-tests/src/benchmarks/latency.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use crate::benchmarks::{delete_workers, invoke_and_await_agent}; +use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -200,6 +201,7 @@ pub struct IterationContext { component: ComponentDto, agent_ids: Vec, length: usize, + env_id: EnvironmentId, } pub struct LatencyBenchmark { @@ -261,6 +263,7 @@ impl LatencyBenchmark { component, agent_ids, length: config.length, + env_id: env.id, } } @@ -326,6 +329,7 @@ impl LatencyBenchmark { .iter() .filter_map(|agent_id| AgentId::from_agent_id(iteration.component.id, agent_id).ok()) .collect(); - delete_workers(&iteration.user, &agent_ids).await + delete_workers(&iteration.user, &agent_ids).await; + cleanup_user_state(&iteration.user, &iteration.env_id).await; } } diff --git a/integration-tests/src/benchmarks/mod.rs b/integration-tests/src/benchmarks/mod.rs index b15dde89a3..0682055643 100644 --- a/integration-tests/src/benchmarks/mod.rs +++ b/integration-tests/src/benchmarks/mod.rs @@ -29,15 +29,20 @@ use std::time::{Duration, SystemTime}; use tracing::{Instrument, info, warn}; use tracing_opentelemetry::OpenTelemetrySpanExt; +pub mod cleanup; pub mod cold_start_unknown; pub mod durability_overhead; pub mod latency; pub mod sleep; pub mod throughput; +pub mod throughput_saturation; -/// Injects the current tracing span's OpenTelemetry trace context (traceparent/tracestate) -/// into a reqwest Request's headers so that downstream services can link their -/// spans to the benchmark's trace. 
+// Re-export cleanup helpers so callers can use the flat `benchmarks::*` path. +pub use cleanup::{cleanup_account, cleanup_env_and_app, cleanup_user_state}; + +/// Injects the current tracing span's OpenTelemetry trace context +/// (traceparent/tracestate) into a reqwest Request's headers so that +/// downstream services can link their spans to the benchmark's trace. fn inject_trace_context(request: &mut Request) { let current_span = tracing::Span::current(); let otel_context = current_span.context(); diff --git a/integration-tests/src/benchmarks/sleep.rs b/integration-tests/src/benchmarks/sleep.rs index 97bb64e16f..457872ed29 100644 --- a/integration-tests/src/benchmarks/sleep.rs +++ b/integration-tests/src/benchmarks/sleep.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::benchmarks::delete_workers; +use crate::benchmarks::{cleanup_user_state, delete_workers}; use async_trait::async_trait; use futures_concurrency::future::Join; use golem_common::base_model::agent::ParsedAgentId; use golem_common::model::AgentId; use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; use golem_common::{agent_id, data_value}; use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; use golem_test_framework::config::benchmark::TestMode; @@ -39,6 +40,7 @@ pub struct SleepIterationContext { user: TestUserContext, component: ComponentDto, agent_ids: Vec, + env_id: EnvironmentId, } #[async_trait] @@ -111,6 +113,7 @@ impl Benchmark for Sleep { user, component, agent_ids, + env_id: env.id, } } @@ -184,6 +187,7 @@ impl Benchmark for Sleep { .iter() .filter_map(|agent_id| AgentId::from_agent_id(context.component.id, agent_id).ok()) .collect(); - delete_workers(&context.user, &agent_ids).await + delete_workers(&context.user, &agent_ids).await; + cleanup_user_state(&context.user, &context.env_id).await; } } diff --git 
a/integration-tests/src/benchmarks/throughput.rs b/integration-tests/src/benchmarks/throughput.rs index 9cdecd7a1f..f3552e0eee 100644 --- a/integration-tests/src/benchmarks/throughput.rs +++ b/integration-tests/src/benchmarks/throughput.rs @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::benchmarks::{delete_workers, invoke_and_await_agent, invoke_and_await_http}; +use crate::benchmarks::{ + cleanup_user_state, delete_workers, invoke_and_await_agent, invoke_and_await_http, +}; use async_trait::async_trait; use axum::http::{HeaderMap, HeaderValue}; use futures_concurrency::future::Join; @@ -21,12 +23,13 @@ use golem_common::base_model::agent::{DataValue, ParsedAgentId}; use golem_common::model::agent::AgentTypeName; use golem_common::model::component::{ComponentDto, ComponentId}; use golem_common::model::domain_registration::{Domain, DomainRegistrationCreation}; +use golem_common::model::environment::EnvironmentId; use golem_common::model::http_api_deployment::{ HttpApiDeploymentAgentOptions, HttpApiDeploymentCreation, }; use golem_common::model::{AgentId, RoutingTable}; use golem_common::{agent_id, data_value}; -use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, RunConfig}; +use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig}; use golem_test_framework::config::benchmark::TestMode; use golem_test_framework::config::dsl_impl::TestUserContext; use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies}; @@ -35,6 +38,7 @@ use indoc::indoc; use reqwest::{Body, Method, Request, Url}; use serde_json::json; use std::collections::BTreeMap; +use std::time::Instant; use tracing::{Instrument, Level, info}; pub struct ThroughputEcho { @@ -79,16 +83,14 @@ impl Benchmark for ThroughputEcho { "echo", "echo", Box::new(|_| data_value!("benchmark")), - Box::new(|port, idx, _length| { - let url = Url::parse(&format!( - 
"http://localhost:{port}/test-{idx}-http/echo/test-message" - )) - .unwrap(); + Box::new(|base_url, idx, _length| { + let url = + Url::parse(&format!("{base_url}/test-{idx}-http/echo/test-message")).unwrap(); Request::new(Method::POST, url) }), - Box::new(|port, idx, _length| { + Box::new(|base_url, idx, _length| { let url = Url::parse(&format!( - "http://localhost:{port}/rust/test-{idx}-http/echo/test-message" + "{base_url}/rust/test-{idx}-http/echo/test-message" )) .unwrap(); Request::new(Method::POST, url) @@ -179,21 +181,16 @@ impl Benchmark for ThroughputLargeInput { let bytes = vec![0u8; length]; data_value!(bytes) }), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - "http://localhost:{port}/test-{idx}-http/large-input" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = Url::parse(&format!("{base_url}/test-{idx}-http/large-input")).unwrap(); let json_body = json!({"input": vec![0u8; length]}).to_string(); let mut request = Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); request }), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - "http://localhost:{port}/rust/test-{idx}-http/large-input" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = + Url::parse(&format!("{base_url}/rust/test-{idx}-http/large-input")).unwrap(); let json_body = json!({"input": vec![0u8; length]}).to_string(); let mut request = Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); @@ -282,21 +279,16 @@ impl Benchmark for ThroughputCpuIntensive { "cpu_intensive", "cpuIntensive", Box::new(|length| data_value!(length as f64)), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - "http://localhost:{port}/test-{idx}-http/cpu-intensive" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = Url::parse(&format!("{base_url}/test-{idx}-http/cpu-intensive")).unwrap(); let json_body = json!({"length": length}).to_string(); let mut request = 
Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); request }), - Box::new(|port, idx, length| { - let url = Url::parse(&format!( - "http://localhost:{port}/rust/test-{idx}-http/cpu-intensive" - )) - .unwrap(); + Box::new(|base_url, idx, length| { + let url = + Url::parse(&format!("{base_url}/rust/test-{idx}-http/cpu-intensive")).unwrap(); let json_body = json!({"length": length}).to_string(); let mut request = Request::new(Method::POST, url); *request.body_mut() = Some(Body::wrap(json_body)); @@ -402,14 +394,20 @@ impl AgentInvocationTarget { } } - pub fn prefix(&self, prefix: &str, routing_table: &RoutingTable) -> String { + pub fn prefix(&self, prefix: &str, routing_table: &Option) -> String { match self { AgentInvocationTarget::Single { .. } => prefix.to_string(), AgentInvocationTarget::Pair { pair, .. } => { - if pair.at_same_worker_executor(routing_table) { - format!("{prefix}local-") + if let Some(rt) = routing_table { + if pair.at_same_worker_executor(rt) { + format!("{prefix}local-") + } else { + format!("{prefix}remote-") + } } else { - format!("{prefix}remote-") + // Routing table not available (no shard-manager port-forward + // configured); all RPC pairs go into a single unlabeled bucket. + prefix.to_string() } } } @@ -426,19 +424,35 @@ pub struct IterationContext { rust_agent_ids_for_http: Vec, ts_agent_ids_for_http: Vec, length: usize, - routing_table: RoutingTable, + /// `None` when shard-manager host/port are not configured (cloud mode + /// without port-forward). When `None`, RPC pairs go into a single unlabeled + /// bucket instead of being split into local/remote. + routing_table: Option, ts_rpc_agent_id_pairs: Vec, rust_rpc_agent_id_pairs: Vec, + env_id: EnvironmentId, } +/// Type for HTTP request builder closures used by the throughput benchmark. +/// Receives `(base_url, agent_index, length)` where `base_url` is the full +/// scheme+host+port prefix (e.g. 
`http://localhost:8084` in local mode or +/// `https://myenv.apps.golem.dev` in cloud mode). +type HttpRequestFn = Box Fn(&'a str, usize, usize) -> Request + Send + Sync + 'static>; + pub struct ThroughputBenchmark { rust_method_name: String, ts_method_name: String, agent_params: Box DataValue + Send + Sync + 'static>, - http_request: Box Request + Send + Sync + 'static>, - rust_http_request: Box Request + Send + Sync + 'static>, + http_request: HttpRequestFn, + rust_http_request: HttpRequestFn, deps: BenchmarkTestDependencies, call_count: usize, + /// Pre-built HTTP client for cloud-mode apps-domain calls + /// (`https://{env_id}.{apps_base_domain}`). Cached here so the + /// connection pool is warm across benchmark iterations. + /// `None` in local/provided mode (client is built per-iteration from the + /// custom-request port with a Host header override). + cloud_http_client: Option, } fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> Vec { @@ -447,13 +461,38 @@ fn agent_ids_to_agent_ids(component_id: ComponentId, ids: &[ParsedAgentId]) -> V .collect() } +/// Records aggregate throughput (invocations per second) for a measurement +/// block as a `count` result under the key `{prefix}throughput-ops-per-sec`. +/// +/// `total_calls` is the total number of invocations issued across all targets +/// in the block; `elapsed` is the wall-clock duration of the concurrently +/// executed block. Throughput is therefore the realised aggregate rate the +/// cluster sustained for this implementation, not a per-call latency. 
+fn record_throughput( + recorder: &BenchmarkRecorder, + prefix: &str, + total_calls: usize, + elapsed: std::time::Duration, +) { + let secs = elapsed.as_secs_f64(); + if secs <= 0.0 || total_calls == 0 { + return; + } + let ops_per_sec = (total_calls as f64 / secs).round() as u64; + info!("{prefix}throughput: {total_calls} calls in {secs:.3}s = {ops_per_sec} ops/sec"); + recorder.count( + &ResultKey::primary(format!("{prefix}throughput-ops-per-sec")), + ops_per_sec, + ); +} + impl ThroughputBenchmark { pub async fn new( rust_method_name: &str, ts_method_name: &str, agent_params: Box DataValue + Send + Sync + 'static>, - http_request: Box Request + Send + Sync + 'static>, - rust_http_request: Box Request + Send + Sync + 'static>, + http_request: HttpRequestFn, + rust_http_request: HttpRequestFn, mode: &TestMode, verbosity: Level, cluster_size: usize, @@ -461,21 +500,40 @@ impl ThroughputBenchmark { call_count: usize, otlp: bool, ) -> Self { + let deps = BenchmarkTestDependencies::new( + mode, + verbosity, + cluster_size, + disable_compilation_cache, + otlp, + ) + .await; + + // Build the cloud HTTP client once so the connection pool stays alive + // across all benchmark iterations. In cloud mode requests go to + // https://{env_id}.{apps_base_domain}, so we use standard TLS with + // ALPN negotiation — NOT http2_prior_knowledge() which is for h2c + // (cleartext HTTP/2) and would bypass the ALPN step that the NLB + // terminating TLS expects. 
+ let cloud_http_client = deps.apps_base_domain().map(|_| { + reqwest::ClientBuilder::new() + .pool_max_idle_per_host(1024) + .pool_idle_timeout(std::time::Duration::from_secs(90)) + .tcp_nodelay(true) + .timeout(std::time::Duration::from_secs(180)) + .build() + .expect("Failed to build cloud HTTP client for throughput benchmark") + }); + Self { rust_method_name: rust_method_name.to_string(), ts_method_name: ts_method_name.to_string(), agent_params, http_request, rust_http_request, - deps: BenchmarkTestDependencies::new( - mode, - verbosity, - cluster_size, - disable_compilation_cache, - otlp, - ) - .await, + deps, call_count, + cloud_http_client, } } @@ -491,13 +549,23 @@ impl ThroughputBenchmark { let mut ts_rpc_agent_id_pairs = vec![]; let mut rust_rpc_agent_id_pairs = vec![]; - let routing_table = self - .deps - .shard_manager() - .get_routing_table() - .await - .expect("Failed to get routing table"); - info!("Fetched routing table: {routing_table}"); + // Fetch routing table when shard-manager is configured; fall back to + // None (unlabeled single-bucket RPC) when not configured (e.g. cloud + // mode without a port-forward to the shard-manager). + let routing_table: Option = + match self.deps.shard_manager().get_routing_table().await { + Ok(rt) => { + info!("Fetched routing table: {rt}"); + Some(rt) + } + Err(err) => { + info!( + "Shard-manager not available, skipping routing table (RPC pairs \ + will be unlabeled): {err:#}" + ); + None + } + }; let user = self.deps.user().await.unwrap(); let (_, env) = user.app_and_env().await.unwrap(); @@ -542,7 +610,14 @@ impl ThroughputBenchmark { let client = user.registry_service_client().await; - let domain = Domain(format!("{}.golem.cloud", env.id)); + // In cloud mode, use the configured apps_base_domain. Fall back to + // "golem.cloud" for local/provided modes. 
+ let apps_base_domain = self + .deps + .apps_base_domain() + .unwrap_or("golem.cloud") + .to_string(); + let domain = Domain(format!("{}.{}", env.id, apps_base_domain)); async { client @@ -605,6 +680,7 @@ impl ThroughputBenchmark { routing_table, ts_rpc_agent_id_pairs, rust_rpc_agent_id_pairs, + env_id: env.id, } } @@ -713,7 +789,7 @@ impl ThroughputBenchmark { pub async fn run(&self, iteration: &IterationContext, recorder: BenchmarkRecorder) { async fn measure_agents( user: &TestUserContext, - routing_table: &RoutingTable, + routing_table: &Option, recorder: &BenchmarkRecorder, length: usize, call_count: usize, @@ -746,7 +822,10 @@ impl ThroughputBenchmark { }) .collect::>(); + let started = Instant::now(); let results = result_futures.join().await; + let elapsed = started.elapsed(); + record_throughput(recorder, prefix, targets.len() * call_count, elapsed); for (idx, (results, target)) in results.iter().zip(targets).enumerate() { let prefix = target.prefix(prefix, routing_table); for result in results { @@ -799,31 +878,51 @@ impl ThroughputBenchmark { .instrument(tracing::info_span!("measure_ts_agents")) .await; - let port = self.deps.worker_service().custom_request_port(); - - let client = { - let mut headers = HeaderMap::new(); - headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap()); - reqwest::Client::builder() - .default_headers(headers) - .build() - .expect("Failed to create HTTP client") - }; + // Resolve the base URL prefix and HTTP client for the code-first HTTP + // API benchmark paths. The request-builder closures append the route + // path (e.g. "/test-0-http/echo/...") to this prefix. + // + // cloud mode: base = "https://{env_id}.apps.dev.golem.cloud" + // → reqwest connects directly to that host (TLS/SNI + + // Host set from the URL); the apps gateway routes it + // to worker-service. Uses the cached, pool-warm client. 
+ // + // local mode: base = "http://localhost:{custom_request_port}" + // → reqwest connects to localhost; an explicit Host + // header ("{env_id}.golem.cloud") tells the local + // worker-service which deployment to route to. + let (http_base_url, client): (String, reqwest::Client) = + if let Some(ref cached) = self.cloud_http_client { + let base = format!("https://{}", iteration.domain.0); + (base, cached.clone()) + } else { + let port = self.deps.worker_service().custom_request_port(); + let base = format!("http://localhost:{port}"); + let mut headers = HeaderMap::new(); + headers.insert("Host", HeaderValue::from_str(&iteration.domain.0).unwrap()); + let c = reqwest::Client::builder() + .default_headers(headers) + .build() + .expect("Failed to create HTTP client"); + (base, c) + }; async { let client = client.clone(); + let base = http_base_url.clone(); let result_futures = iteration .rust_agent_ids_for_http .iter() .enumerate() .map(move |(idx, _agent_id)| { let client = client.clone(); + let base = base.clone(); async move { let mut results = vec![]; for _ in 0..self.call_count { results.push( invoke_and_await_http(client.clone(), || { - (self.rust_http_request)(port, idx, iteration.length) + (self.rust_http_request)(&base, idx, iteration.length) }) .await, ) @@ -833,7 +932,15 @@ impl ThroughputBenchmark { }) .collect::>(); + let started = Instant::now(); let results = result_futures.join().await; + let elapsed = started.elapsed(); + record_throughput( + &recorder, + "rust-agent-http-", + iteration.rust_agent_ids_for_http.len() * self.call_count, + elapsed, + ); for (idx, results) in results.iter().enumerate() { for result in results { result.record(&recorder, "rust-agent-http-", idx.to_string().as_str()); @@ -850,12 +957,13 @@ impl ThroughputBenchmark { .enumerate() .map(move |(idx, _agent_id)| { let client = client.clone(); + let base = http_base_url.clone(); async move { let mut results = vec![]; for _ in 0..self.call_count { results.push( 
invoke_and_await_http(client.clone(), || { - (self.http_request)(port, idx, iteration.length) + (self.http_request)(&base, idx, iteration.length) }) .await, ) @@ -865,7 +973,15 @@ impl ThroughputBenchmark { }) .collect::>(); + let started = Instant::now(); let results = result_futures.join().await; + let elapsed = started.elapsed(); + record_throughput( + &recorder, + "ts-agent-http-", + iteration.ts_agent_ids_for_http.len() * self.call_count, + elapsed, + ); for (idx, results) in results.iter().enumerate() { for result in results { result.record(&recorder, "ts-agent-http-", idx.to_string().as_str()); @@ -969,5 +1085,6 @@ impl ThroughputBenchmark { } } delete_workers(&iteration.user, &rust_rpc_workers).await; + cleanup_user_state(&iteration.user, &iteration.env_id).await; } } diff --git a/integration-tests/src/benchmarks/throughput_saturation.rs b/integration-tests/src/benchmarks/throughput_saturation.rs new file mode 100644 index 0000000000..768d8c7eb1 --- /dev/null +++ b/integration-tests/src/benchmarks/throughput_saturation.rs @@ -0,0 +1,425 @@ +// Copyright 2024-2026 Golem Cloud +// +// Licensed under the Golem Source License v1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://license.golem.cloud/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Throughput-under-memory-saturation benchmarks. +//! +//! Unlike the regular throughput benchmark — which keeps `size` small enough +//! that all workers fit comfortably in memory — these benchmarks deliberately +//! ramp the number of *active* agents up to and past the executor's memory +//! 
ceiling, to find the knee: the agent count where the pod can still keep +//! everything resident (latency flat, throughput scaling linearly) just before +//! it starts evicting and replaying (latency spikes, throughput craters). +//! +//! The measured `run` phase drives sustained load over a fixed window: each +//! agent repeatedly does a short unit of work then goes idle for [`IDLE_GAP`]. +//! During that gap the agent has no in-flight work and becomes a `LoadedIdle` +//! eviction candidate, so under memory pressure it can be evicted and then must +//! reload (oplog replay + re-admission) on its next call — the churn that makes +//! throughput crater past the knee. Starts are staggered so the fleet is not +//! synchronised. +//! +//! Three variants: +//! - `throughput-saturation-counters`: agent-counters with a synthetic, +//! per-agent-distinct retained footprint (`allocate_memory`) plus CPU work +//! (`busy_for`). The footprint is controllable via `length`. +//! - `throughput-saturation-echo-rust` / `throughput-saturation-echo-ts`: the +//! benchmark `echo` agent (Rust / TS) called repeatedly. No synthetic +//! footprint — the per-agent memory is the agent's natural footprint, which +//! for the TS agent includes the QuickJS runtime. Answers "how many actively +//! invoked echo agents fit per pod". +//! +//! Parameters: +//! - `size` = number of active agents in this step (the ramp axis). +//! - `length` = for the counters variant, the base per-agent memory footprint in +//! bytes (agent `i` retains a deterministic multiple); ignored by the echo +//! variants. 
+ +use crate::benchmarks::{cleanup_user_state, delete_workers, invoke_and_await_agent}; +use async_trait::async_trait; +use futures_concurrency::future::Join; +use golem_common::base_model::agent::{DataValue, ParsedAgentId}; +use golem_common::model::AgentId; +use golem_common::model::component::ComponentDto; +use golem_common::model::environment::EnvironmentId; +use golem_common::{agent_id, data_value}; +use golem_test_framework::benchmark::{Benchmark, BenchmarkRecorder, ResultKey, RunConfig}; +use golem_test_framework::config::benchmark::TestMode; +use golem_test_framework::config::dsl_impl::TestUserContext; +use golem_test_framework::config::{BenchmarkTestDependencies, TestDependencies}; +use golem_test_framework::dsl::{TestDsl, TestDslExtended}; +use indoc::indoc; +use std::time::{Duration, Instant}; +use tracing::{Instrument, Level, info}; + +/// Number of distinct footprint buckets the synthetic per-agent memory spread +/// cycles through, so the fleet holds a mix of sizes rather than a uniform +/// amount. +const SPREAD_BUCKETS: usize = 8; + +/// CPU busy time (ms) per `busy_for` invocation (counters variant only). +const BUSY_MILLIS: u32 = 50; + +/// Idle gap each agent sleeps between calls. During this gap the agent has no +/// in-flight work and becomes a `LoadedIdle` eviction candidate. Under memory +/// pressure it may be evicted and then must reload on its next call — the churn +/// this benchmark exists to measure. +const IDLE_GAP: Duration = Duration::from_millis(200); + +/// Total measured wall-clock duration of the sustained-load phase. Throughput +/// and churn are measured over this fixed window so steps with different `size` +/// are comparable. Held long enough that the high-residency plateau persists for +/// at least a minute, so steady-state behaviour at the memory ceiling (not just +/// the initial burst) is observed. 
+const RUN_DURATION: Duration = Duration::from_secs(90); + +/// Maximum per-agent start stagger, so the fleet is not synchronised: at any +/// instant some agents are mid-call (demanding memory) while others sit idle +/// (evictable). +const MAX_STAGGER: Duration = Duration::from_millis(250); + +/// Resident memory (bytes) the synthetic-footprint agent `index` retains for a +/// given `base`. Spreads deterministically across [`SPREAD_BUCKETS`] buckets +/// (`base * 1` .. `base * SPREAD_BUCKETS`) so different agents hold different +/// amounts and some sit much closer to the limit than others. +fn agent_memory_bytes(index: usize, base: usize) -> u32 { + let bucket = (index % SPREAD_BUCKETS) + 1; + (base.saturating_mul(bucket)).min(u32::MAX as usize) as u32 +} + +/// Per-agent start offset derived deterministically from the index, spread +/// across `[0, MAX_STAGGER)`. +fn agent_stagger(index: usize) -> Duration { + let frac = (index as u32).wrapping_mul(2_654_435_761) % 1000; + MAX_STAGGER.checked_mul(frac).unwrap_or_default() / 1000 +} + +/// Describes one saturation variant: which component to load, which agent type +/// and method to actively invoke, and whether to pre-load a synthetic footprint. +struct SaturationVariant { + /// WASM file name (without `.wasm`) in the component directory. + wasm_name: &'static str, + /// Registry display name for the component. + component_name: &'static str, + /// Agent type to instantiate. + agent_type: &'static str, + /// Method invoked repeatedly during the measured phase. + active_method: &'static str, + /// Builds the parameter for one `active_method` call. + active_params: fn() -> DataValue, + /// When set, each agent calls this method once in warmup with its + /// deterministic footprint (`allocate_memory`-style). `None` for the echo + /// variants, whose footprint is the agent's natural memory. 
+ allocate_method: Option<&'static str>, +} + +const COUNTERS_VARIANT: SaturationVariant = SaturationVariant { + wasm_name: "it_agent_counters_release", + component_name: "it:agent-counters", + agent_type: "Counter", + active_method: "busy_for", + active_params: || data_value!(BUSY_MILLIS), + allocate_method: Some("allocate_memory"), +}; + +const ECHO_RUST_VARIANT: SaturationVariant = SaturationVariant { + wasm_name: "benchmark_agent_rust_release", + component_name: "benchmark:agent-rust", + agent_type: "RustBenchmarkAgent", + active_method: "echo", + active_params: || data_value!("saturation"), + allocate_method: None, +}; + +const ECHO_TS_VARIANT: SaturationVariant = SaturationVariant { + wasm_name: "benchmark_agent_ts", + component_name: "benchmark:agent-ts", + agent_type: "BenchmarkAgent", + active_method: "echo", + active_params: || data_value!("saturation"), + allocate_method: None, +}; + +pub struct SaturationBenchmarkContext { + deps: BenchmarkTestDependencies, +} + +pub struct SaturationIterationContext { + user: TestUserContext, + component: ComponentDto, + agent_ids: Vec, + base_memory_bytes: usize, + env_id: EnvironmentId, +} + +/// Shared implementation for all saturation variants. The variant-specific +/// config is supplied by the wrapper types' `variant()`. 
+async fn create_context( + mode: &TestMode, + verbosity: Level, + cluster_size: usize, + disable_compilation_cache: bool, + otlp: bool, +) -> SaturationBenchmarkContext { + SaturationBenchmarkContext { + deps: BenchmarkTestDependencies::new( + mode, + verbosity, + cluster_size, + disable_compilation_cache, + otlp, + ) + .await, + } +} + +async fn setup_iteration( + variant: &SaturationVariant, + config: &RunConfig, + benchmark_context: &SaturationBenchmarkContext, +) -> SaturationIterationContext { + let user = benchmark_context.deps.user().await.unwrap(); + let (_, env) = user.app_and_env().await.unwrap(); + + info!("Registering component {}", variant.component_name); + let component = user + .component(&env.id, variant.wasm_name) + .name(variant.component_name) + .store() + .await + .unwrap(); + + let mut agent_ids = vec![]; + for n in 0..config.size { + agent_ids.push(agent_id!(variant.agent_type, format!("saturation-{n}"))); + } + + SaturationIterationContext { + user, + component, + agent_ids, + base_memory_bytes: config.length, + env_id: env.id, + } +} + +async fn warmup(variant: &SaturationVariant, context: &SaturationIterationContext) { + let Some(allocate_method) = variant.allocate_method else { + // Echo variants: nothing to pre-load; the agent's natural footprint is + // established on first invocation. 
+ return; + }; + + async { + let base = context.base_memory_bytes; + let result_futures = context + .agent_ids + .iter() + .enumerate() + .map(move |(idx, agent_id)| async move { + let user_clone = context.user.clone(); + let bytes = agent_memory_bytes(idx, base); + invoke_and_await_agent( + &user_clone, + &context.component, + agent_id, + allocate_method, + data_value!(bytes), + ) + .await + }) + .collect::>(); + let _ = result_futures.join().await; + } + .instrument(tracing::info_span!( + "warmup_allocate_memory", + agent_count = context.agent_ids.len() + )) + .await; +} + +async fn run( + variant: &SaturationVariant, + context: &SaturationIterationContext, + recorder: BenchmarkRecorder, +) { + let agent_count = context.agent_ids.len(); + let deadline = Instant::now() + RUN_DURATION; + + let result_futures = context + .agent_ids + .iter() + .enumerate() + .map(|(idx, agent_id)| { + let recorder = recorder.clone(); + async move { + let user_clone = context.user.clone(); + + tokio::time::sleep(agent_stagger(idx)).await; + + let mut calls = 0u64; + while Instant::now() < deadline { + let result = invoke_and_await_agent( + &user_clone, + &context.component, + agent_id, + variant.active_method, + (variant.active_params)(), + ) + .await; + result.record(&recorder, "", idx.to_string().as_str()); + calls += 1; + tokio::time::sleep(IDLE_GAP).await; + } + calls + } + }) + .collect::>(); + + let started = Instant::now(); + let per_agent_calls = result_futures.join().await; + let elapsed = started.elapsed(); + + // Aggregate sustained throughput over the fixed run window. Across `size` + // steps, this reveals where added active agents stop adding throughput + // (memory saturation / eviction churn dominates) — the knee we are after. 
+    let total_calls: u64 = per_agent_calls.iter().sum();
+    let secs = elapsed.as_secs_f64();
+    if secs > 0.0 {
+        let ops_per_sec = (total_calls as f64 / secs).round() as u64;
+        info!(
+            "saturation: {agent_count} agents, {total_calls} calls in {secs:.1}s = {ops_per_sec} ops/sec"
+        );
+        recorder.count(
+            &ResultKey::primary("saturation-throughput-ops-per-sec"),
+            ops_per_sec,
+        );
+    }
+}
+
+async fn cleanup_iteration(context: SaturationIterationContext) {
+    let agent_ids: Vec<_> = context // element type is inferred from the closure below
+        .agent_ids
+        .iter() // best-effort: ids that fail to convert are skipped, not reported
+        .filter_map(|agent_id| AgentId::from_agent_id(context.component.id, agent_id).ok())
+        .collect();
+    delete_workers(&context.user, &agent_ids).await;
+    cleanup_user_state(&context.user, &context.env_id).await;
+}
+
+/// Generates a `Benchmark` impl wrapper for a saturation variant.
+macro_rules! saturation_benchmark {
+    ($ty:ident, $bench_name:literal, $variant:expr, $description:literal) => {
+        pub struct $ty {
+            config: RunConfig,
+        }
+
+        #[async_trait]
+        impl Benchmark for $ty {
+            type BenchmarkContext = SaturationBenchmarkContext;
+            type IterationContext = SaturationIterationContext;
+
+            fn name() -> &'static str {
+                $bench_name
+            }
+
+            fn description() -> &'static str {
+                indoc! { $description }
+            }
+
+            async fn create_benchmark_context(
+                mode: &TestMode,
+                verbosity: Level,
+                cluster_size: usize,
+                disable_compilation_cache: bool,
+                otlp: bool,
+            ) -> Self::BenchmarkContext {
+                create_context(
+                    mode,
+                    verbosity,
+                    cluster_size,
+                    disable_compilation_cache,
+                    otlp,
+                )
+                .await
+            }
+
+            async fn cleanup(benchmark_context: Self::BenchmarkContext) {
+                benchmark_context.deps.kill_all().await;
+            }
+
+            async fn create(_mode: &TestMode, config: RunConfig) -> Self {
+                Self { config }
+            }
+
+            async fn setup_iteration(
+                &self,
+                benchmark_context: &Self::BenchmarkContext,
+            ) -> Self::IterationContext {
+                setup_iteration(&$variant, &self.config, benchmark_context).await
+            }
+
+            async fn warmup(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: &Self::IterationContext,
+            ) {
+                warmup(&$variant, context).await
+            }
+
+            async fn run(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: &Self::IterationContext,
+                recorder: BenchmarkRecorder,
+            ) {
+                run(&$variant, context, recorder).await
+            }
+
+            async fn cleanup_iteration(
+                &self,
+                _benchmark_context: &Self::BenchmarkContext,
+                context: Self::IterationContext,
+            ) {
+                cleanup_iteration(context).await
+            }
+        }
+    };
+}
+
+saturation_benchmark!(
+    ThroughputSaturationCounters,
+    "throughput-saturation-counters",
+    COUNTERS_VARIANT,
+    "Ramps `size` active agents that each retain a deterministic, per-agent-distinct
+     synthetic memory footprint (controlled by `length`) and do CPU work, measuring
+     sustained throughput to locate the memory-saturation knee."
+);
+
+saturation_benchmark!(
+    ThroughputSaturationEchoRust,
+    "throughput-saturation-echo-rust",
+    ECHO_RUST_VARIANT,
+    "Ramps `size` actively-invoked Rust `echo` agents to find how many fit resident
+     per pod before eviction churn craters throughput. The per-agent footprint is the
+     agent's natural memory (no synthetic allocation)."
+);
+
+saturation_benchmark!(
+    ThroughputSaturationEchoTs,
+    "throughput-saturation-echo-ts",
+    ECHO_TS_VARIANT,
+    "Ramps `size` actively-invoked TypeScript `echo` agents to find how many fit
+     resident per pod before eviction churn craters throughput. The per-agent
+     footprint is the agent's natural memory, including the QuickJS runtime."
+);
diff --git a/test-components/agent-counters/Cargo.toml b/test-components/agent-counters/Cargo.toml
index c7567da5a5..069f9180f3 100644
--- a/test-components/agent-counters/Cargo.toml
+++ b/test-components/agent-counters/Cargo.toml
@@ -3,6 +3,12 @@ name = "it_agent_counters"
 version = "0.0.1"
 edition = "2024"
 
+# Standalone workspace root: this component is excluded from the golem-oss
+# workspace, and when built nested inside another repo's workspace (e.g. the
+# cloud-perf CI checkout under golem-cloud) cargo would otherwise walk up and
+# attach it to that unrelated workspace. An empty table stops that search.
+[workspace]
+
 [profile.release]
 opt-level = "s"
 lto = true
diff --git a/test-components/agent-counters/src/lib.rs b/test-components/agent-counters/src/lib.rs
index b2ac7d4d44..b14840512d 100644
--- a/test-components/agent-counters/src/lib.rs
+++ b/test-components/agent-counters/src/lib.rs
@@ -3,6 +3,43 @@ pub mod repository;
 
 use golem_rust::{agent_definition, agent_implementation, generate_idempotency_key};
 
+/// Page size used when touching retained memory so the OS backs it with real
+/// resident pages rather than leaving it as untouched (non-resident) reservation.
+const PAGE_SIZE: usize = 4096;
+
+/// Spins doing cheap arithmetic for approximately `millis` milliseconds, polling
+/// the monotonic clock between batches of work rather than on every iteration so
+/// the workload is CPU-bound, not clock-syscall-bound. Returns an accumulated
+/// value so the work cannot be optimised away.
+fn busy_loop(millis: u32) -> u32 {
+    let deadline = std::time::Duration::from_millis(millis as u64);
+    let start = std::time::Instant::now();
+    let mut acc: u32 = 0;
+    loop {
+        for i in 0..10_000u32 {
+            acc = acc.wrapping_add(i).wrapping_mul(31).wrapping_add(7);
+        }
+        if start.elapsed() >= deadline {
+            break;
+        }
+    }
+    std::hint::black_box(acc) // opaque use so the spin survives callers that drop the result
+}
+
+/// Grows `buffer` to hold `bytes` and touches one byte per page so the memory
+/// becomes resident (real RSS), not just reserved address space.
+fn retain_memory(buffer: &mut Vec<u8>, bytes: u32) {
+    let bytes = bytes as usize;
+    buffer.clear();
+    buffer.shrink_to_fit();
+    buffer.resize(bytes, 0);
+    let mut page = 0;
+    while page < bytes {
+        buffer[page] = buffer[page].wrapping_add(1);
+        page += PAGE_SIZE;
+    }
+}
+
 #[agent_definition]
 trait Counter {
     fn new(id: String) -> Self;
@@ -10,17 +47,32 @@ trait Counter {
     async fn increment_through_rpc(&mut self) -> u32;
     async fn increment_through_rpc_to_ephemeral(&mut self) -> u32;
     async fn increment_through_rpc_to_ephemeral_phantom(&mut self) -> u32;
+
+    /// Spins for `millis` milliseconds of cheap CPU work, then increments and
+    /// returns the counter. Used to define an "active" agent without making the
+    /// workload oplog-bound on a tight loop.
+    fn busy_for(&mut self, millis: u32) -> u32;
+
+    /// Retains `bytes` of resident linear memory in the agent's state and
+    /// increments the counter. The memory stays resident across invocations so
+    /// the agent contributes a controllable footprint to the executor's pool.
+    fn allocate_memory(&mut self, bytes: u32) -> u32;
 }
 
 struct CounterImpl {
     count: u32,
     id: String,
+    retained: Vec<u8>,
 }
 
 #[agent_implementation]
 impl Counter for CounterImpl {
     fn new(id: String) -> Self {
-        Self { id, count: 0 }
+        Self {
+            id,
+            count: 0,
+            retained: Vec::new(),
+        }
     }
 
     fn increment(&mut self) -> u32 {
@@ -42,29 +94,64 @@ impl Counter for CounterImpl {
         let mut client = EphemeralSingletonCounterClient::new_phantom();
         client.increment().await
     }
+
+    fn busy_for(&mut self, millis: u32) -> u32 {
+        let _ = busy_loop(millis);
+        self.count += 1;
+        self.count
+    }
+
+    fn allocate_memory(&mut self, bytes: u32) -> u32 {
+        retain_memory(&mut self.retained, bytes);
+        self.count += 1;
+        self.count
+    }
 }
 
 #[agent_definition(ephemeral)]
 trait EphemeralCounter {
     fn new(id: String) -> Self;
     fn increment(&mut self) -> u32;
+
+    /// See [`Counter::busy_for`].
+    fn busy_for(&mut self, millis: u32) -> u32;
+
+    /// See [`Counter::allocate_memory`].
+    fn allocate_memory(&mut self, bytes: u32) -> u32;
 }
 
 struct EphemeralCounterImpl {
     count: u32,
     _id: String,
+    retained: Vec<u8>,
 }
 
 #[agent_implementation]
 impl EphemeralCounter for EphemeralCounterImpl {
     fn new(id: String) -> Self {
-        Self { _id: id, count: 0 }
+        Self {
+            _id: id,
+            count: 0,
+            retained: Vec::new(),
+        }
     }
 
     fn increment(&mut self) -> u32 {
         self.count += 1;
         self.count
     }
+
+    fn busy_for(&mut self, millis: u32) -> u32 {
+        let _ = busy_loop(millis);
+        self.count += 1;
+        self.count
+    }
+
+    fn allocate_memory(&mut self, bytes: u32) -> u32 {
+        retain_memory(&mut self.retained, bytes);
+        self.count += 1;
+        self.count
+    }
 }