Skip to content

Commit 263de2e

Browse files
authored
Merge branch 'master' into master
2 parents fecdbcc + 4a81904 commit 263de2e

14 files changed

+265
-201
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ cmake --build . --config Release
148148
### Run
149149
150150
```
151-
usage: ./bin/sd [arguments]
151+
usage: ./build/bin/sd [arguments]
152152

153153
arguments:
154154
-h, --help show this help message and exit
@@ -161,6 +161,7 @@ arguments:
161161
--control-net [CONTROL_PATH] path to control net model
162162
--embd-dir [EMBEDDING_PATH] path to embeddings.
163163
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.
164+
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
164165
--type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
165166
If not specified, the default is the type of the weight file.
166167
--lora-model-dir [DIR] lora model directory
@@ -186,6 +187,7 @@ arguments:
186187
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
187188
--vae-tiling process vae in tiles to reduce memory usage
188189
--control-net-cpu keep controlnet in cpu (for low vram)
190+
--canny apply canny preprocessor (edge detection)
189191
-v, --verbose print extra info
190192
```
191193

clip.hpp

+25-63
Original file line numberDiff line numberDiff line change
@@ -960,64 +960,32 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
960960
return hidden_states;
961961
}
962962

963-
struct ggml_cgraph* build_graph(struct ggml_allocr* allocr, std::vector<int> tokens, bool return_pooled = false) {
963+
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
964+
struct ggml_tensor* input_ids2 = NULL,
965+
size_t max_token_idx = 0,
966+
bool return_pooled = false) {
964967
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
965968

966-
struct ggml_tensor* input_ids = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, tokens.size());
967-
ggml_allocr_alloc(allocr, input_ids);
968-
969-
if (!ggml_allocr_is_measure(allocr)) {
970-
ggml_backend_tensor_set(input_ids, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids));
971-
}
972-
973-
struct ggml_tensor* input_ids2 = NULL;
974-
size_t max_token_idx = 0;
975-
if (version == VERSION_XL) {
976-
input_ids2 = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, tokens.size());
977-
ggml_allocr_alloc(allocr, input_ids2);
978-
979-
auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
980-
if (it != tokens.end()) {
981-
std::fill(std::next(it), tokens.end(), 0);
982-
}
983-
984-
max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
985-
986-
// for (int i = 0; i < tokens.size(); i++) {
987-
// printf("%d ", tokens[i]);
988-
// }
989-
// printf("\n");
990-
991-
if (!ggml_allocr_is_measure(allocr)) {
992-
ggml_backend_tensor_set(input_ids2, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids2));
993-
}
969+
input_ids2 = to_backend(input_ids2);
970+
if (!return_pooled) {
971+
input_ids = to_backend(input_ids);
994972
}
995973

996974
struct ggml_tensor* embeddings = NULL;
997975

998976
if (num_custom_embeddings > 0 && version != VERSION_XL) {
999-
embeddings = ggml_new_tensor_2d(compute_ctx,
1000-
wtype,
1001-
text_model.hidden_size,
1002-
text_model.vocab_size + num_custom_embeddings /* custom placeholder */);
1003-
ggml_allocr_alloc(allocr, embeddings);
1004-
if (!ggml_allocr_is_measure(allocr)) {
1005-
// really bad, there is memory inflexibility (this is for host<->device memory conflicts)
1006-
auto token_embed_weight = text_model.get_token_embed_weight();
1007-
void* freeze_data = malloc(ggml_nbytes(token_embed_weight));
1008-
ggml_backend_tensor_get_and_sync(backend,
1009-
token_embed_weight,
1010-
freeze_data,
1011-
0,
1012-
ggml_nbytes(token_embed_weight));
1013-
ggml_backend_tensor_set(embeddings, freeze_data, 0, ggml_nbytes(token_embed_weight));
1014-
free(freeze_data);
1015-
// concatenate custom embeddings
1016-
ggml_backend_tensor_set(embeddings,
1017-
(const void*)token_embed_custom.data(),
1018-
ggml_nbytes(token_embed_weight),
1019-
num_custom_embeddings * text_model.hidden_size * ggml_type_size(wtype));
1020-
}
977+
auto custom_embeddings = ggml_new_tensor_3d(compute_ctx,
978+
wtype,
979+
text_model.hidden_size,
980+
1,
981+
num_custom_embeddings);
982+
set_backend_tensor_data(custom_embeddings, token_embed_custom.data());
983+
984+
auto token_embed_weight = text_model.get_token_embed_weight();
985+
token_embed_weight = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
986+
// concatenate custom embeddings
987+
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
988+
embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
1021989
}
1022990

1023991
struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
@@ -1028,12 +996,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
1028996
}
1029997

1030998
void compute(const int n_threads,
1031-
std::vector<int> tokens,
999+
struct ggml_tensor* input_ids,
1000+
struct ggml_tensor* input_ids2,
1001+
size_t max_token_idx,
10321002
bool return_pooled,
10331003
ggml_tensor** output,
10341004
ggml_context* output_ctx = NULL) {
10351005
auto get_graph = [&]() -> struct ggml_cgraph* {
1036-
return build_graph(compute_allocr, tokens, return_pooled);
1006+
return build_graph(input_ids, input_ids2, max_token_idx, return_pooled);
10371007
};
10381008
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
10391009
}
@@ -1147,8 +1117,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLModule {
11471117
vision_model.get_param_tensors(tensors, prefix + "transformer.visual_model");
11481118
}
11491119

1150-
struct ggml_cgraph* build_graph(struct ggml_allocr* allocr,
1151-
struct ggml_tensor* pixel_values) {
1120+
struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
11521121
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
11531122

11541123
pixel_values = to_backend(pixel_values);
@@ -1160,19 +1129,12 @@ struct FrozenCLIPVisionEmbedder : public GGMLModule {
11601129
return gf;
11611130
}
11621131

1163-
void alloc_compute_buffer(ggml_context* work_ctx, ggml_tensor* pixel_values) {
1164-
auto get_graph = [&]() -> struct ggml_cgraph* {
1165-
return build_graph(compute_allocr, pixel_values);
1166-
};
1167-
GGMLModule::alloc_compute_buffer(get_graph);
1168-
}
1169-
11701132
void compute(const int n_threads,
11711133
ggml_tensor* pixel_values,
11721134
ggml_tensor** output,
11731135
ggml_context* output_ctx) {
11741136
auto get_graph = [&]() -> struct ggml_cgraph* {
1175-
return build_graph(compute_allocr, pixel_values);
1137+
return build_graph(pixel_values);
11761138
};
11771139
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
11781140
}

control.hpp

+18-18
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ class ControlNetBlock : public GGMLBlock {
166166

167167
struct ggml_tensor* resblock_forward(std::string name,
168168
struct ggml_context* ctx,
169-
struct ggml_allocr* allocr,
170169
struct ggml_tensor* x,
171170
struct ggml_tensor* emb) {
172171
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@@ -175,7 +174,6 @@ class ControlNetBlock : public GGMLBlock {
175174

176175
struct ggml_tensor* attention_layer_forward(std::string name,
177176
struct ggml_context* ctx,
178-
struct ggml_allocr* allocr,
179177
struct ggml_tensor* x,
180178
struct ggml_tensor* context) {
181179
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
@@ -201,11 +199,10 @@ class ControlNetBlock : public GGMLBlock {
201199
}
202200

203201
std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
204-
struct ggml_allocr* allocr,
205202
struct ggml_tensor* x,
206203
struct ggml_tensor* hint,
207204
struct ggml_tensor* guided_hint,
208-
std::vector<float> timesteps,
205+
struct ggml_tensor* timesteps,
209206
struct ggml_tensor* context,
210207
struct ggml_tensor* y = NULL) {
211208
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
@@ -231,7 +228,7 @@ class ControlNetBlock : public GGMLBlock {
231228

232229
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
233230

234-
auto t_emb = new_timestep_embedding(ctx, allocr, timesteps, model_channels); // [N, model_channels]
231+
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
235232

236233
auto emb = time_embed_0->forward(ctx, t_emb);
237234
emb = ggml_silu_inplace(ctx, emb);
@@ -272,10 +269,10 @@ class ControlNetBlock : public GGMLBlock {
272269
for (int j = 0; j < num_res_blocks; j++) {
273270
input_block_idx += 1;
274271
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
275-
h = resblock_forward(name, ctx, allocr, h, emb); // [N, mult*model_channels, h, w]
272+
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
276273
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
277274
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
278-
h = attention_layer_forward(name, ctx, allocr, h, context); // [N, mult*model_channels, h, w]
275+
h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
279276
}
280277

281278
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@@ -299,9 +296,9 @@ class ControlNetBlock : public GGMLBlock {
299296
// [N, 4*model_channels, h/8, w/8]
300297

301298
// middle_block
302-
h = resblock_forward("middle_block.0", ctx, allocr, h, emb); // [N, 4*model_channels, h/8, w/8]
303-
h = attention_layer_forward("middle_block.1", ctx, allocr, h, context); // [N, 4*model_channels, h/8, w/8]
304-
h = resblock_forward("middle_block.2", ctx, allocr, h, emb); // [N, 4*model_channels, h/8, w/8]
299+
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
300+
h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
301+
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
305302

306303
// out
307304
outs.push_back(middle_block_out->forward(ctx, h));
@@ -386,18 +383,22 @@ struct ControlNet : public GGMLModule {
386383

387384
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
388385
struct ggml_tensor* hint,
389-
std::vector<float> timesteps,
386+
struct ggml_tensor* timesteps,
390387
struct ggml_tensor* context,
391388
struct ggml_tensor* y = NULL) {
392389
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
393390

394-
x = to_backend(x);
395-
hint = to_backend(hint);
396-
context = to_backend(context);
397-
y = to_backend(y);
391+
x = to_backend(x);
392+
if (guided_hint_cached) {
393+
hint = NULL;
394+
} else {
395+
hint = to_backend(hint);
396+
}
397+
context = to_backend(context);
398+
y = to_backend(y);
399+
timesteps = to_backend(timesteps);
398400

399401
auto outs = control_net.forward(compute_ctx,
400-
compute_allocr,
401402
x,
402403
hint,
403404
guided_hint_cached ? guided_hint : NULL,
@@ -420,7 +421,7 @@ struct ControlNet : public GGMLModule {
420421
void compute(int n_threads,
421422
struct ggml_tensor* x,
422423
struct ggml_tensor* hint,
423-
std::vector<float> timesteps,
424+
struct ggml_tensor* timesteps,
424425
struct ggml_tensor* context,
425426
struct ggml_tensor* y,
426427
struct ggml_tensor** output = NULL,
@@ -434,7 +435,6 @@ struct ControlNet : public GGMLModule {
434435
};
435436

436437
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
437-
438438
guided_hint_cached = true;
439439
}
440440

examples/cli/main.cpp

+44-8
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ struct SDParams {
9696
bool vae_tiling = false;
9797
bool control_net_cpu = false;
9898
bool canny_preprocess = false;
99+
int upscale_repeats = 1;
99100
};
100101

101102
void print_params(SDParams params) {
@@ -129,6 +130,7 @@ void print_params(SDParams params) {
129130
printf(" seed: %ld\n", params.seed);
130131
printf(" batch_count: %d\n", params.batch_count);
131132
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
133+
printf(" upscale_repeats: %d\n", params.upscale_repeats);
132134
}
133135

134136
void print_usage(int argc, const char* argv[]) {
@@ -145,6 +147,7 @@ void print_usage(int argc, const char* argv[]) {
145147
printf(" --control-net [CONTROL_PATH] path to control net model\n");
146148
printf(" --embd-dir [EMBEDDING_PATH] path to embeddings.\n");
147149
printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n");
150+
printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n");
148151
printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
149152
printf(" If not specified, the default is the type of the weight file.\n");
150153
printf(" --lora-model-dir [DIR] lora model directory\n");
@@ -296,6 +299,16 @@ void parse_args(int argc, const char** argv, SDParams& params) {
296299
break;
297300
}
298301
params.prompt = argv[i];
302+
} else if (arg == "--upscale-repeats") {
303+
if (++i >= argc) {
304+
invalid_arg = true;
305+
break;
306+
}
307+
params.upscale_repeats = std::stoi(argv[i]);
308+
if (params.upscale_repeats < 1) {
309+
fprintf(stderr, "error: upscale multiplier must be at least 1\n");
310+
exit(1);
311+
}
299312
} else if (arg == "-n" || arg == "--negative-prompt") {
300313
if (++i >= argc) {
301314
invalid_arg = true;
@@ -486,6 +499,18 @@ void parse_args(int argc, const char** argv, SDParams& params) {
486499
}
487500
}
488501

502+
/// Returns the final component of `path`, i.e. everything after the last
/// directory separator.
///
/// Both '/' and '\\' are treated as separators, and the LAST occurrence of
/// either one wins. Searching for both at once keeps mixed-separator paths
/// (e.g. "C:/models\\v1.ckpt") correct; looking for '/' first and only falling
/// back to '\\' would return "models\\v1.ckpt" for that input.
///
/// @param path  File path; may be empty or contain no separator at all.
/// @return      The substring after the last separator, the whole `path` when
///              it contains no separator, or an empty string when `path` ends
///              with a separator.
static std::string sd_basename(const std::string& path) {
    const std::string::size_type last_sep = path.find_last_of("/\\");
    if (last_sep == std::string::npos) {
        // No directory part: the path is already a bare file name.
        return path;
    }
    return path.substr(last_sep + 1);
}
513+
489514
std::string get_image_params(SDParams params, int64_t seed) {
490515
std::string parameter_string = params.prompt + "\n";
491516
if (params.negative_prompt.size() != 0) {
@@ -618,7 +643,14 @@ int main(int argc, const char* argv[]) {
618643
input_image_buffer};
619644
if (params.canny_preprocess) { // apply preprocessor
620645
LOG_INFO("Applying canny preprocessor");
621-
control_image->data = preprocess_canny(control_image->data, control_image->width, control_image->height);
646+
control_image->data = preprocess_canny(control_image->data,
647+
control_image->width,
648+
control_image->height,
649+
0.08f,
650+
0.08f,
651+
0.8f,
652+
1.0f,
653+
false);
622654
}
623655
}
624656
results = txt2img(sd_ctx,
@@ -700,7 +732,7 @@ int main(int argc, const char* argv[]) {
700732
}
701733

702734
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
703-
if (params.esrgan_path.size() > 0) {
735+
if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
704736
upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
705737
params.n_threads,
706738
params.wtype);
@@ -712,13 +744,17 @@ int main(int argc, const char* argv[]) {
712744
if (results[i].data == NULL) {
713745
continue;
714746
}
715-
sd_image_t upscaled_image = upscale(upscaler_ctx, results[i], upscale_factor);
716-
if (upscaled_image.data == NULL) {
717-
printf("upscale failed\n");
718-
continue;
747+
sd_image_t current_image = results[i];
748+
for (int u = 0; u < params.upscale_repeats; ++u) {
749+
sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor);
750+
if (upscaled_image.data == NULL) {
751+
printf("upscale failed\n");
752+
break;
753+
}
754+
free(current_image.data);
755+
current_image = upscaled_image;
719756
}
720-
free(results[i].data);
721-
results[i] = upscaled_image;
757+
results[i] = current_image; // Set the final upscaled image as the result
722758
}
723759
}
724760
}

ggml

Submodule ggml updated from 9a5ce30 to 4212b75

0 commit comments

Comments
 (0)