From 466698a988394ff278916ea48e0cb660c73c67de Mon Sep 17 00:00:00 2001 From: fszontagh Date: Sat, 13 Jun 2026 09:41:15 +0200 Subject: [PATCH] perf: --eager-load-params for fast steady-state streaming --- examples/common/common.cpp | 6 ++++++ examples/common/common.h | 1 + include/stable-diffusion.h | 1 + src/model_manager.cpp | 11 +++++++++++ src/model_manager.h | 1 + src/stable-diffusion.cpp | 16 ++++++++++++++-- 6 files changed, 34 insertions(+), 2 deletions(-) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 3ae5faba7..b27a9ee36 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -449,6 +449,10 @@ ArgOptions SDContextParams::get_options() { "--stream-layers", "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)", true, &stream_layers}, + {"", + "--eager-load-params", + "load all model params into the params backend up front instead of lazily on first use (faster steady-state; higher load-time cost)", + true, &eager_load_params}, {"", "--force-sdxl-vae-conv-scale", "force use of conv scale on sdxl vae", @@ -733,6 +737,7 @@ std::string SDContextParams::to_string() const { << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " max_vram: " << max_vram << ",\n" << " stream_layers: " << (stream_layers ? "true" : "false") << ",\n" + << " eager_load_params: " << (eager_load_params ? "true" : "false") << ",\n" << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" @@ -815,6 +820,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f str_to_vae_format(vae_format), max_vram, stream_layers, + eager_load_params, backend.c_str(), params_backend.c_str(), }; diff --git a/examples/common/common.h b/examples/common/common.h index a90a33132..0a5b1185b 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -146,6 +146,7 @@ struct SDContextParams { bool offload_params_to_cpu = false; float max_vram = 0.f; bool stream_layers = false; + bool eager_load_params = false; std::string backend; std::string params_backend; bool enable_mmap = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 2175f895a..f81369a72 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -224,6 +224,7 @@ typedef struct { enum sd_vae_format_t vae_format; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB) bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) + bool eager_load_params; // Load all model params into the params backend at model-load time instead of lazily on first use const char* backend; const char* params_backend; } sd_ctx_params_t; diff --git a/src/model_manager.cpp b/src/model_manager.cpp index 328a478bb..5ef07573a 100644 --- a/src/model_manager.cpp +++ b/src/model_manager.cpp @@ -147,6 +147,17 @@ bool ModelManager::register_param_tensors(const std::string& desc, return true; } +bool ModelManager::load_all_params_eagerly() { + std::vector all_states; + all_states.reserve(tensor_states_.size()); + for (const auto& s : tensor_states_) { + if (s != nullptr) { + all_states.push_back(s.get()); + } + } + return load_tensors_to_params_backend(all_states); +} + bool ModelManager::validate_registered_tensors() { bool ok = true; for (const auto& state : tensor_states_) { diff --git a/src/model_manager.h b/src/model_manager.h index b3da8a36a..3c37ab7ed 100644 --- a/src/model_manager.h +++ b/src/model_manager.h @@ -122,6 +122,7 @@ class ModelManager : public RunnerWeightManager { ggml_backend_t params_backend, size_t* registered_tensor_size = nullptr); bool validate_registered_tensors(); + bool load_all_params_eagerly(); bool prepare_params(const std::vector& tensors) override; void release_compute_backend_params(const std::vector& tensors) override; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 04f0598c0..6d0df2430 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -192,6 +192,7 @@ class StableDiffusionGGML { bool enable_mmap = false; float max_vram = 0.f; bool stream_layers = false; + bool eager_load_params = false; std::string backend_spec; std::string params_backend_spec; @@ -326,6 +327,7 @@ class StableDiffusionGGML { enable_mmap = sd_ctx_params->enable_mmap; max_vram = sd_ctx_params->max_vram; stream_layers = sd_ctx_params->stream_layers; + eager_load_params = sd_ctx_params->eager_load_params; backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); if (stream_layers && max_vram == 0.f) { @@ -1107,8 +1109,15 @@ class StableDiffusionGGML { LOG_ERROR("model metadata validation failed"); return false; } - - LOG_DEBUG("model metadata validated; weights will be prepared lazily"); + if (eager_load_params) { + if (!model_manager->load_all_params_eagerly()) { + LOG_ERROR("eager params load failed"); + return false; + } + LOG_DEBUG("model metadata validated; weights pre-loaded to params backend"); + } else { + LOG_DEBUG("model metadata validated; weights will be prepared lazily"); + } { size_t total_params_ram_size = 0; @@ -2637,6 +2646,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->max_vram = 0.f; sd_ctx_params->stream_layers = false; + sd_ctx_params->eager_load_params = false; sd_ctx_params->enable_mmap = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; @@ -2686,6 +2696,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "offload_params_to_cpu: %s\n" "max_vram: %.3f\n" "stream_layers: %s\n" + "eager_load_params: %s\n" "backend: %s\n" "params_backend: %s\n" "keep_clip_on_cpu: %s\n" @@ -2726,6 +2737,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { BOOL_STR(sd_ctx_params->offload_params_to_cpu), sd_ctx_params->max_vram, BOOL_STR(sd_ctx_params->stream_layers), + BOOL_STR(sd_ctx_params->eager_load_params), SAFE_STR(sd_ctx_params->backend), SAFE_STR(sd_ctx_params->params_backend), BOOL_STR(sd_ctx_params->keep_clip_on_cpu),