Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ enum SDVersion {
VERSION_LONGCAT,
VERSION_PID,
VERSION_IDEOGRAM4,
VERSION_ESRGAN,
VERSION_COUNT,
};

Expand Down
30 changes: 15 additions & 15 deletions src/model/diffusion/ideogram4.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,11 @@ namespace Ideogram4 {
}

return Rope::embed_interleaved_mrope(ids,
bs,
static_cast<float>(rope_theta),
head_dim,
mrope_section,
axis_wrap_dims);
bs,
static_cast<float>(rope_theta),
head_dim,
mrope_section,
axis_wrap_dims);
}

class Ideogram4Attention : public GGMLBlock {
Expand Down Expand Up @@ -505,16 +505,16 @@ namespace Ideogram4 {
int64_t head_dim = config.emb_dim / config.num_heads;

auto runner_ctx = get_context();
pe_vec = gen_ideogram4_pe(static_cast<int>(grid_h),
static_cast<int>(grid_w),
static_cast<int>(x->ne[3]),
static_cast<int>(context_len),
static_cast<int>(head_dim),
static_cast<int>(config.rope_theta),
config.mrope_section,
runner_ctx.circular_x_enabled,
runner_ctx.circular_y_enabled);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
pe_vec = gen_ideogram4_pe(static_cast<int>(grid_h),
static_cast<int>(grid_w),
static_cast<int>(x->ne[3]),
static_cast<int>(context_len),
static_cast<int>(head_dim),
static_cast<int>(config.rope_theta),
config.mrope_section,
runner_ctx.circular_x_enabled,
runner_ctx.circular_y_enabled);
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len);
set_backend_tensor_data(pe, pe_vec.data());

image_indicator_vec.assign(static_cast<size_t>(pos_len), 1);
Expand Down
308 changes: 102 additions & 206 deletions src/model/upscaler/esrgan.hpp

Large diffs are not rendered by default.

230 changes: 103 additions & 127 deletions src/model/upscaler/ltx_latent_upscaler.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#ifndef __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__
#define __SD_MODEL_UPSCALER_LTX_LATENT_UPSCALER_HPP__

#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <set>
Expand Down Expand Up @@ -32,90 +32,100 @@ namespace LTXVUpsampler {
int spatial_up_num = 2;
int spatial_down_den = 1;
int temporal_up_factor = 1;
};

// Reports whether `tensor_storage_map` contains an entry keyed by `name`.
static inline bool has_tensor(const String2TensorStorage& tensor_storage_map,
                              const std::string& name) {
    const auto entry = tensor_storage_map.find(name);
    if (entry == tensor_storage_map.end()) {
        return false;
    }
    return true;
}

// Reads dimension `axis` (the `ne[axis]` entry) of the tensor stored under `name`.
// Yields `fallback` when no such tensor exists or when `axis` lies outside
// [0, GGML_MAX_DIMS).
static inline int64_t get_tensor_ne(const String2TensorStorage& tensor_storage_map,
                                    const std::string& name,
                                    int axis,
                                    int64_t fallback) {
    const auto entry          = tensor_storage_map.find(name);
    const bool found          = !(entry == tensor_storage_map.end());
    const bool axis_in_range  = (0 <= axis) && (axis < GGML_MAX_DIMS);
    // Only dereference the iterator once both checks have passed.
    return (found && axis_in_range) ? entry->second.ne[axis] : fallback;
}

// Convenience wrapper around get_tensor_ne() for axis 0: returns `ne[0]` of the
// tensor stored under `name`, or `fallback` when the lookup fails.
static inline int64_t get_tensor_ne0(const String2TensorStorage& tensor_storage_map,
                                     const std::string& name,
                                     int64_t fallback) {
    constexpr int first_axis = 0;
    return get_tensor_ne(tensor_storage_map, name, first_axis, fallback);
}

static inline int count_module_blocks(const String2TensorStorage& tensor_storage_map,
const std::string& module_name) {
int max_block = -1;
const std::string prefix = module_name + ".";
for (const auto& pair : tensor_storage_map) {
const std::string& name = pair.first;
if (name.find(prefix) != 0) {
continue;
static LatentUpsamplerConfig detect_from_weights(const String2TensorStorage& tensor_storage_map,
const std::string& prefix = "") {
LatentUpsamplerConfig config;
auto find_weight = [&](const std::string& suffix) -> const TensorStorage* {
std::string name = prefix.empty() ? suffix : prefix + "." + suffix;
auto iter = tensor_storage_map.find(name);
if (iter == tensor_storage_map.end()) {
return nullptr;
}
return &iter->second;
};

bool inferred = false;

const TensorStorage* initial_norm = find_weight("initial_norm.weight");
if (initial_norm != nullptr) {
config.mid_channels = initial_norm->ne[0];
inferred = true;
}
size_t begin = prefix.size();
size_t end = name.find('.', begin);
if (end == std::string::npos) {
continue;

const TensorStorage* final_conv = find_weight("final_conv.bias");
if (final_conv != nullptr) {
config.in_channels = final_conv->ne[0];
inferred = true;
}
int index = atoi(name.substr(begin, end - begin).c_str());
max_block = std::max(max_block, index);
}
return max_block + 1;
}

static inline LatentUpsamplerConfig detect_config_from_weights(const String2TensorStorage& tensor_storage_map) {
LatentUpsamplerConfig config;
config.mid_channels = get_tensor_ne0(tensor_storage_map, "initial_norm.weight", config.mid_channels);
config.in_channels = get_tensor_ne0(tensor_storage_map, "final_conv.bias", config.in_channels);
int detected_blocks = count_module_blocks(tensor_storage_map, "res_blocks");
if (detected_blocks > 0) {
config.num_blocks_per_stage = detected_blocks;
}
config.rational_resampler = has_tensor(tensor_storage_map, "upsampler.conv.weight");
int64_t upsampler_out_channels = get_tensor_ne0(tensor_storage_map, "upsampler.0.bias", 0);
config.spatial_upsample = config.rational_resampler || upsampler_out_channels == 4 * config.mid_channels;
config.temporal_upsample = upsampler_out_channels == 2 * config.mid_channels;
if (config.temporal_upsample) {
config.temporal_up_factor = 2;
}
if (config.rational_resampler) {
int64_t out_channels = get_tensor_ne(tensor_storage_map,
"upsampler.conv.weight",
3,
config.mid_channels * 9);
if (config.mid_channels > 0 && out_channels % config.mid_channels == 0) {
int64_t ratio = out_channels / config.mid_channels;
int num = static_cast<int>(std::round(std::sqrt(static_cast<double>(ratio))));
if (num > 0 && static_cast<int64_t>(num) * num == ratio) {
config.spatial_up_num = num;
int detected_blocks = 0;
const std::string res_blocks_prefix = prefix.empty() ? "res_blocks." : prefix + ".res_blocks.";
for (const auto& [name, _] : tensor_storage_map) {
if (!starts_with(name, res_blocks_prefix)) {
continue;
}
size_t begin = res_blocks_prefix.size();
size_t end = name.find('.', begin);
if (end == std::string::npos) {
continue;
}
try {
int idx = std::stoi(name.substr(begin, end - begin));
detected_blocks = std::max(detected_blocks, idx + 1);
} catch (...) {
}
}
if (config.spatial_up_num == 3) {
config.spatial_down_den = 2;
config.spatial_scale = 1.5f;
} else if (config.spatial_up_num == 4) {
config.spatial_down_den = 1;
config.spatial_scale = 4.f;
} else {
config.spatial_down_den = 1;
config.spatial_scale = static_cast<float>(config.spatial_up_num);
if (detected_blocks > 0) {
config.num_blocks_per_stage = detected_blocks;
inferred = true;
}

const TensorStorage* rational_upsampler_weight = find_weight("upsampler.conv.weight");
const TensorStorage* upsampler_bias = find_weight("upsampler.0.bias");
config.rational_resampler = rational_upsampler_weight != nullptr;
int64_t upsampler_out_channels = upsampler_bias == nullptr ? 0 : upsampler_bias->ne[0];
config.spatial_upsample = config.rational_resampler || upsampler_out_channels == 4 * config.mid_channels;
config.temporal_upsample = upsampler_out_channels == 2 * config.mid_channels;
if (config.rational_resampler || upsampler_out_channels > 0) {
inferred = true;
}
if (config.temporal_upsample) {
config.temporal_up_factor = 2;
}
if (rational_upsampler_weight != nullptr) {
int64_t out_channels = rational_upsampler_weight->ne[3];
if (config.mid_channels > 0 && out_channels % config.mid_channels == 0) {
int64_t ratio = out_channels / config.mid_channels;
int num = static_cast<int>(std::round(std::sqrt(static_cast<double>(ratio))));
if (num > 0 && static_cast<int64_t>(num) * num == ratio) {
config.spatial_up_num = num;
}
}
if (config.spatial_up_num == 3) {
config.spatial_down_den = 2;
config.spatial_scale = 1.5f;
} else if (config.spatial_up_num == 4) {
config.spatial_down_den = 1;
config.spatial_scale = 4.f;
} else {
config.spatial_down_den = 1;
config.spatial_scale = static_cast<float>(config.spatial_up_num);
}
}

if (inferred) {
LOG_DEBUG("ltx latent upsampler: in_channels = %" PRId64 ", mid_channels = %" PRId64 ", num_blocks_per_stage = %d, spatial_scale = %.3f, temporal_up_factor = %d, rational_resampler = %d",
config.in_channels,
config.mid_channels,
config.num_blocks_per_stage,
config.spatial_scale,
config.temporal_up_factor,
config.rational_resampler);
}
return config;
}
return config;
}
};

class VideoGroupNorm : public GGMLBlock {
protected:
Expand Down Expand Up @@ -419,34 +429,14 @@ namespace LTXVUpsampler {
};

struct LatentUpsamplerRunner : public GGMLRunner {
LatentUpsamplerConfig config;
std::unique_ptr<LatentUpsampler> model;

LatentUpsamplerRunner(ggml_backend_t backend,
ggml_backend_t params_backend)
: GGMLRunner(backend, params_backend) {}

// Overrides the base-class hook and returns the fixed description string
// "ltx_latent_upsampler" for this runner.
std::string get_desc() override {
return "ltx_latent_upsampler";
}

bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading LTX latent upsampler from '%s'", file_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init LTX latent upsampler model loader from file failed: '%s'", file_path.c_str());
return false;
}

const auto& tensor_storage_map = model_loader.get_tensor_storage_map();
bool has_regular_upsampler = has_tensor(tensor_storage_map, "upsampler.0.weight");
bool has_rational_spatial = has_tensor(tensor_storage_map, "upsampler.conv.weight");
if (!has_tensor(tensor_storage_map, "post_upsample_res_blocks.0.conv2.bias") ||
(!has_regular_upsampler && !has_rational_spatial)) {
LOG_ERROR("unsupported LTX latent upsampler weights: expected upsampler tensors");
return false;
}

LatentUpsamplerConfig config = detect_config_from_weights(tensor_storage_map);
ggml_backend_t params_backend,
const String2TensorStorage& tensor_storage_map)
: GGMLRunner(backend, params_backend),
config(LatentUpsamplerConfig::detect_from_weights(tensor_storage_map)) {
if (config.dims != 3 || (!config.spatial_upsample && !config.temporal_upsample) ||
config.spatial_up_num < 1 || config.spatial_down_den < 1 || config.temporal_up_factor < 1) {
LOG_ERROR("unsupported LTX latent upsampler config: dims=%d spatial=%d temporal=%d rational=%d scale=%.3f temporal_factor=%d",
Expand All @@ -456,35 +446,21 @@ namespace LTXVUpsampler {
config.rational_resampler,
config.spatial_scale,
config.temporal_up_factor);
return false;
return;
}

model = std::make_unique<LatentUpsampler>(config);
model->init(params_ctx, tensor_storage_map, "");
if (!alloc_params_buffer()) {
LOG_ERROR("LTX latent upsampler params buffer allocation failed");
return false;
}
}

std::map<std::string, ggml_tensor*> tensors;
model->get_param_tensors(tensors);
std::set<std::string> ignore_tensors;
if (config.rational_resampler) {
ignore_tensors.insert("upsampler.blur_down.kernel");
}
model_loader.set_n_threads(n_threads);
if (!model_loader.load_tensors(tensors, ignore_tensors)) {
LOG_ERROR("load LTX latent upsampler tensors failed");
return false;
// Overrides the base-class hook and returns the fixed description string
// "ltx_latent_upsampler" for this runner.
std::string get_desc() override {
return "ltx_latent_upsampler";
}

void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors) {
if (model) {
model->get_param_tensors(tensors);
}
LOG_INFO("LTX latent upsampler loaded: in_channels=%" PRId64 ", mid_channels=%" PRId64 ", blocks=%d, scale=%.3f, temporal_factor=%d, rational=%d",
config.in_channels,
config.mid_channels,
config.num_blocks_per_stage,
config.spatial_scale,
config.temporal_up_factor,
config.rational_resampler);
return true;
}

ggml_cgraph* build_graph(const sd::Tensor<float>& x_tensor) {
Expand Down Expand Up @@ -515,9 +491,9 @@ namespace LTXVUpsampler {
(long long)x.shape()[4]);
return {};
}
if (x.shape()[3] != model->config.in_channels) {
if (x.shape()[3] != config.in_channels) {
LOG_ERROR("LTX latent upsampler expected %" PRId64 " channels, got %lld",
model->config.in_channels,
config.in_channels,
(long long)x.shape()[3]);
return {};
}
Expand Down
39 changes: 39 additions & 0 deletions src/name_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,46 @@ bool is_first_stage_model_name(const std::string& name) {
return false;
}

/**
 * Rewrites an ESRGAN-style tensor name that uses "model.<index>." prefixes
 * into the "conv_first." / "body.<i>.rdb<r>.conv<c>." / "conv_body." /
 * "conv_hr." / "conv_up<k>." / "conv_last." naming.
 *
 * @param name Tensor name as stored in the checkpoint (taken by value and
 *             rewritten in place).
 * @return The converted name. The actual substitution is delegated to
 *         replace_with_prefix_map(); presumably a name matching no table
 *         entry comes back unchanged -- confirm against that helper.
 */
static std::string convert_esrgan_tensor_name(std::string name) {
    // The table is built exactly once, inside the static's initializer.
    // Initialization of a function-local static is thread-safe since C++11,
    // whereas lazily filling the map behind an `empty()` check is a data race
    // when two threads make the first call concurrently.
    // Left non-const so it binds to replace_with_prefix_map() whatever
    // reference qualifier that helper's map parameter uses.
    static std::unordered_map<std::string, std::string> esrgan_name_map = [] {
        std::unordered_map<std::string, std::string> table;
        table["model.0."] = "conv_first.";

        constexpr int max_num_blocks = 64;
        for (int i = 0; i < max_num_blocks; i++) {
            const std::string block_index  = std::to_string(i);
            const std::string block_prefix = "model.1.sub." + block_index + ".";
            for (int rdb = 1; rdb <= 3; rdb++) {
                const std::string rdb_index = std::to_string(rdb);
                for (int conv = 1; conv <= 5; conv++) {
                    const std::string conv_index = std::to_string(conv);
                    table[block_prefix + "RDB" + rdb_index + ".conv" + conv_index + ".0."] =
                        "body." + block_index + ".rdb" + rdb_index + ".conv" + conv_index + ".";
                }
            }
            // A bare weight/bias directly under any "sub.<i>." index maps to
            // conv_body (presumably the trunk conv that follows the last
            // block, whose index depends on the model's block count).
            table[block_prefix + "weight"] = "conv_body.weight";
            table[block_prefix + "bias"]   = "conv_body.bias";
        }

        // RealESRGAN stores only the learned layers in a Sequential. These indices
        // cover the common x1, x2 and x4 layouts.
        table["model.2."]  = "conv_hr.";
        table["model.3."]  = "conv_up1.";
        table["model.4."]  = "conv_last.";
        table["model.5."]  = "conv_hr.";
        table["model.6."]  = "conv_up2.";
        table["model.7."]  = "conv_last.";
        table["model.8."]  = "conv_hr.";
        table["model.10."] = "conv_last.";
        return table;
    }();

    replace_with_prefix_map(name, esrgan_name_map);
    return name;
}

std::string convert_tensor_name(std::string name, SDVersion version) {
if (version == VERSION_ESRGAN) {
return convert_esrgan_tensor_name(std::move(name));
}

bool is_lora = false;
bool is_lycoris_underline = false;
bool is_underline = false;
Expand Down
Loading
Loading