Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
981191f
feat: cloud-mode TestMode::Cloud for benchmarks with best-effort clea…
kmatasfp Jun 2, 2026
341bab3
feat: add run specific details to perf tests
kmatasfp Jun 3, 2026
b1764ec
fix(benchmark): make --builtin-plugin-owner-account-id and --default-…
kmatasfp Jun 4, 2026
4294bdb
fix: make ProvidedShardManager kill/restart no-ops instead of panics
kmatasfp Jun 5, 2026
5b9902b
feat(benchmark): enable all tests
kmatasfp Jun 5, 2026
742a669
feat: retry connectivity to shard manager
kmatasfp Jun 5, 2026
18d5af6
chore: fmt
kmatasfp Jun 5, 2026
395bcd2
investigation: run echo test first to see if they get stuck again
kmatasfp Jun 5, 2026
dac3c69
feat(benchmark): lower number of concurrent live apps
kmatasfp Jun 6, 2026
2256623
feat: more observability, make memory component coefficient configurable
kmatasfp Jun 6, 2026
02e527a
feat(benchmark): run only throughput-echo test
kmatasfp Jun 6, 2026
faeb651
feat(bench): try 200 apps after tuning
kmatasfp Jun 6, 2026
f8dd565
feat: try 250 again
kmatasfp Jun 6, 2026
1bf0063
feat(benchmark): run all the tests again
kmatasfp Jun 7, 2026
2e53af6
fix: metric description
kmatasfp Jun 7, 2026
32ef9e5
feat: proper load for our cluster
kmatasfp Jun 7, 2026
bc11779
feat(benchmark): run only benchmark tests
kmatasfp Jun 8, 2026
5347626
feat: enable all tests again
kmatasfp Jun 8, 2026
9e582a2
feat(benchmark): increase max number of concurrent compilations
kmatasfp Jun 8, 2026
e7b44bf
feat(worker-executor): add measured-headroom memory admission gate
kmatasfp Jun 8, 2026
817c672
feat(worker-executor): charge component module size once per resident…
kmatasfp Jun 9, 2026
35874d3
fix(worker-executor): disable measured admission when executor does n…
kmatasfp Jun 9, 2026
acb9968
feat(benchmark): add throughput-under-memory-saturation benchmarks
kmatasfp Jun 9, 2026
bfe1b14
test(worker-executor): exercise admission reserve under maximum concu…
kmatasfp Jun 9, 2026
c3af739
feat(benchmark): longer sustained load, bump the number of agents
kmatasfp Jun 9, 2026
7dcb2d3
fix: add empty workspace
kmatasfp Jun 9, 2026
139aed5
fix: use snake case as method names
kmatasfp Jun 9, 2026
442c1c5
chore: 300 already saturates, no need for 500
kmatasfp Jun 9, 2026
4bbb200
fix(worker-executor): avoid deadlock between memory grow and admissio…
kmatasfp Jun 9, 2026
be19cf4
feat: change order of tests
kmatasfp Jun 10, 2026
21fd401
feat: restore iterations count to 3
kmatasfp Jun 10, 2026
a9285c0
refactor(worker-executor): make cgroup gate primary, semaphore clampe…
kmatasfp Jun 10, 2026
27119b2
feat: run only initial echo test to make sure we did not make it slower
kmatasfp Jun 10, 2026
b608593
feat: run only saturation test
kmatasfp Jun 10, 2026
1f1b77a
feat: bigger saturation spread
kmatasfp Jun 10, 2026
1bd27ea
feat(benchmark): change the steps
kmatasfp Jun 10, 2026
898435d
feat: replace estimate-semaphore completely with measured-headroom ad…
kmatasfp Jun 10, 2026
8cecf91
fix: clippy warnings
kmatasfp Jun 10, 2026
8566f13
fix: startup message regarding memory
kmatasfp Jun 11, 2026
626e4ba
chore: run only oom test
kmatasfp Jun 11, 2026
2434047
feat: enable the whole perf test suite
kmatasfp Jun 11, 2026
a8fcf52
feat: more metrics plus FixedProbe for tests
kmatasfp Jun 11, 2026
7eb6f08
fix: make admission gate reserve atomic to prevent ceiling overshoot
kmatasfp Jun 11, 2026
83a6b2f
test: gate concurrent-agent permit tests with a semaphore, not Notify
kmatasfp Jun 11, 2026
24673f6
feat: expose tokio metrics
kmatasfp Jun 11, 2026
a1928c5
fix: prevent concurrent-agent scheduler deadlock on cancel-after-grant
kmatasfp Jun 12, 2026
183de28
feat: use official tokio-metrics crate to expose tokio runtime metrics
kmatasfp Jun 12, 2026
78d311d
feat: use official tokio-metrics crate to expose tokio runtime metric…
kmatasfp Jun 12, 2026
aec411b
chore: cleanup comments
kmatasfp Jun 12, 2026
71bee78
feat: try mimalloc
kmatasfp Jun 13, 2026
b146823
feat: try mimalloc vol 2
kmatasfp Jun 13, 2026
0ef2c16
perf: enable thin LTO and codegen-units=1 for release builds
kmatasfp Jun 13, 2026
99e3633
perf: pin target-cpu baseline for published images (x86-64-v3, neover…
kmatasfp Jun 13, 2026
3218b4f
perf: drop codegen-units=1, keep thin LTO
kmatasfp Jun 13, 2026
ac3b3ee
chore: lower number of concurrent agents to 200 in case of durability…
kmatasfp Jun 14, 2026
ee49d86
feat: restore 3 iterations
kmatasfp Jun 14, 2026
e0bdf30
fix: release worker memory grant on cancelled start
kmatasfp Jun 14, 2026
4f4a5ca
feat: move tests around
kmatasfp Jun 14, 2026
4cfd2e7
debug: add more logging to figure out why we leak memory and deadlock
kmatasfp Jun 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -836,10 +836,14 @@ jobs:
- platform: linux/amd64
name: linux/amd64
target: x86_64-unknown-linux-gnu
target_cpu_env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS
target_cpu: "-C target-cpu=x86-64-v3"
- platform: linux/arm64
name: linux/arm64
target: aarch64-unknown-linux-gnu
cross: true
target_cpu_env: CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUSTFLAGS
target_cpu: "-C target-cpu=neoverse-n1"
name: docker-targets-build (${{ matrix.platform.platform }})
steps:
- uses: actions/checkout@v5
Expand All @@ -854,6 +858,12 @@ jobs:
run: |
platform=${{ matrix.platform.platform }}
echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
# Target-scoped rustflags pin the instruction-set baseline for the
# published images: x86-64-v3 on amd64, Neoverse-N1 on arm64. The
# per-target CARGO_TARGET_*_RUSTFLAGS form is used because plain
# RUSTFLAGS is ignored when cross-compiling; CARGO_-prefixed vars are
# also passed through into the cross container automatically.
echo "${{ matrix.platform.target_cpu_env }}=${{ matrix.platform.target_cpu }}" >> $GITHUB_ENV
- run: cargo install cross
if: ${{ matrix.platform.cross }}
Expand Down
123 changes: 123 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,13 @@ log = "0.4.26"
mac_address = "1.1.8"
mappable-rc = "0.1.1"
md5 = "0.7.0"
metrics = "0.24.2"
metrics-exporter-prometheus = { version = "0.16.2", default-features = false }
metrohash = "1.0.7"
miette = { version = "7.6.0", features = ["fancy"] }
mime = "0.3.17"
mime_guess = "2.0.5"
mimalloc = "0.1.52"
minijinja = "2.7.0"

nanoid = "0.4.0"
Expand Down Expand Up @@ -248,6 +251,7 @@ textwrap = "0.16.1"
thiserror = "2.0.12"
time = { version = "0.3.41", features = ["default", "macros"] }
tokio = { version = "1.44", features = ["macros", "rt-multi-thread", "sync", "io-std", "net", "tracing", "process", "signal"] }
tokio-metrics = { version = "0.5.0", features = ["metrics-rs-integration"] }
tokio-postgres = "0.7.13"
tokio-rustls = { version = "0.26.2" }
tokio-stream = { version = "0.1", features = ["sync"] }
Expand Down Expand Up @@ -337,6 +341,7 @@ debug = "line-tables-only"

[profile.release]
panic = "abort"
lto = "thin"

[profile.benchmarks]
inherits = "release"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
#GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
Expand Down Expand Up @@ -228,6 +230,8 @@ GOLEM__LIMITS__MAX_CONCURRENT_STREAMS=1024
GOLEM__LIMITS__MAX_INVOCATION_CONTEXT_STACK_DEPTH=1024
GOLEM__LIMITS__MAX_OPLOG_QUERY_PAGES_SIZE=100
GOLEM__MEMORY__ACQUIRE_RETRY_DELAY="500ms"
GOLEM__MEMORY__COMPONENT_SIZE_COEFFICIENT=2.0
GOLEM__MEMORY__ENABLE_MEASURED_ADMISSION=true
#GOLEM__MEMORY__SYSTEM_MEMORY_OVERRIDE=
GOLEM__MEMORY__WORKER_ESTIMATE_COEFFICIENT=1.1
GOLEM__MEMORY__WORKER_MEMORY_RATIO=0.8
Expand Down
4 changes: 4 additions & 0 deletions golem-debugging-service/config/debug-worker-executor.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ max_oplog_query_pages_size = 100

[memory]
acquire_retry_delay = "500ms"
component_size_coefficient = 2.0
enable_measured_admission = true
worker_estimate_coefficient = 1.1
worker_memory_ratio = 0.8

Expand Down Expand Up @@ -364,6 +366,8 @@ without_time = false
#
# [memory]
# acquire_retry_delay = "500ms"
# component_size_coefficient = 2.0
# enable_measured_admission = true
# worker_estimate_coefficient = 1.1
# worker_memory_ratio = 0.8
#
Expand Down
1 change: 1 addition & 0 deletions golem-debugging-service/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ impl DebugConfig {
max_in_function_retry_delay: std::time::Duration::from_secs(20),
max_websocket_connections: 100,
quota_service: QuotaServiceConfig::default(),
runtime_metrics_sampling_interval: std::time::Duration::from_secs(5),
}
}
}
Expand Down
16 changes: 11 additions & 5 deletions golem-debugging-service/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,13 +375,19 @@ pub async fn run_debug_worker_executor<T: Bootstrap<DebugContext> + ?Sized + Sen
) -> anyhow::Result<RunDetails> {
debug!("Initializing debug worker executor");

let total_system_memory = golem_config.memory.total_system_memory();
let system_memory = golem_config.memory.system_memory();
let worker_memory = golem_config.memory.worker_memory();
let memory_snapshot =
golem_worker_executor::services::active_workers::memory_probe::default_probe(
golem_config.memory.system_memory_override,
)
.snapshot();
let total_system_memory = memory_snapshot.limit_bytes;
let used_system_memory = memory_snapshot.current_bytes;
let worker_memory =
(total_system_memory as f64 * golem_config.memory.worker_memory_ratio) as u64;
info!(
"Total system memory: {}, Available system memory: {}, Total memory available for workers: {}",
"Measured memory limit: {}, Currently used: {}, Usable for workers: {}",
ISizeFormatter::new(total_system_memory, humansize::BINARY),
ISizeFormatter::new(system_memory, humansize::BINARY),
ISizeFormatter::new(used_system_memory, humansize::BINARY),
ISizeFormatter::new(worker_memory, humansize::BINARY)
);

Expand Down
29 changes: 27 additions & 2 deletions golem-service-base/src/observability.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,42 @@ use axum::response::IntoResponse;
use axum::routing::get;
use http::Response;
use prometheus::{Encoder, Registry, TextEncoder};
use std::sync::Arc;
use tokio::net::{TcpListener, ToSocketAddrs};
use tokio::task::JoinSet;
use tracing::{Instrument, info};

/// A callback that renders additional metrics in Prometheus text exposition
/// format, appended to the output of the `prometheus`-crate registry on the
/// `/metrics` endpoint. Used to surface metrics from a second metrics façade
/// (e.g. the `metrics`-crate recorder driving tokio-metrics) on the same
/// scrape endpoint.
pub type ExtraMetrics = Arc<dyn Fn() -> String + Send + Sync>;

pub async fn start_health_and_metrics_server(
addr: impl ToSocketAddrs,
registry: Registry,
body_message: &'static str,
join_set: &mut JoinSet<Result<(), anyhow::Error>>,
) -> Result<u16, anyhow::Error> {
start_health_and_metrics_server_with_extra(addr, registry, None, body_message, join_set).await
}

pub async fn start_health_and_metrics_server_with_extra(
addr: impl ToSocketAddrs,
registry: Registry,
extra: Option<ExtraMetrics>,
body_message: &'static str,
join_set: &mut JoinSet<Result<(), anyhow::Error>>,
) -> Result<u16, anyhow::Error> {
let app = Router::new()
.route("/healthcheck", get(move || async move { body_message }))
.route(
"/metrics",
get(|| async move { prometheus_metrics(registry.clone()) }),
get(move || {
let extra = extra.clone();
async move { prometheus_metrics(registry.clone(), extra) }
}),
);

let listener = TcpListener::bind(addr).await?;
Expand All @@ -51,13 +72,17 @@ pub async fn start_health_and_metrics_server(
Ok(local_addr.port())
}

pub fn prometheus_metrics(registry: Registry) -> impl IntoResponse {
pub fn prometheus_metrics(registry: Registry, extra: Option<ExtraMetrics>) -> impl IntoResponse {
let encoder = TextEncoder::new();
let mut buffer = Vec::new();

let metric_families = registry.gather();
encoder.encode(&metric_families, &mut buffer).unwrap();

if let Some(extra) = extra {
buffer.extend_from_slice(extra().as_bytes());
}

Response::builder()
.header("Content-Type", encoder.format_type())
.body(Body::from(buffer))
Expand Down
Loading
Loading