leejet
diff --git a/README.md b/README.md
+3-1
diff --git a/clip.hpp b/clip.hpp
+25-63
diff --git a/control.hpp b/control.hpp
+18-18
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
+44-8
diff --git a/ggml b/ggml
@@ -148,7 +148,7 @@ cmake --build . --config Release
 ### Run
 
 ```
-usage: ./bin/sd [arguments]
+usage: ./build/bin/sd [arguments]
 
 arguments:
   -h, --help                         show this help message and exit
@@ -161,6 +161,7 @@ arguments:
   --control-net [CONTROL_PATH]       path to control net model
   --embd-dir [EMBEDDING_PATH]        path to embeddings.
   --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.
+  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
   --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
                                      If not specified, the default is the type of the weight file.
   --lora-model-dir [DIR]             lora model directory
@@ -186,6 +187,7 @@ arguments:
                                      <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
   --vae-tiling                       process vae in tiles to reduce memory usage
   --control-net-cpu                  keep controlnet in cpu (for low vram)
+  --canny                            apply canny preprocessor (edge detection)
   -v, --verbose                      print extra info
 ```
 
 
@@ -960,64 +960,32 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
         return hidden_states;
     }
 
-    struct ggml_cgraph* build_graph(struct ggml_allocr* allocr, std::vector<int> tokens, bool return_pooled = false) {
+    struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
+                                    struct ggml_tensor* input_ids2 = NULL,
+                                    size_t max_token_idx           = 0,
+                                    bool return_pooled             = false) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
-        struct ggml_tensor* input_ids = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, tokens.size());
-        ggml_allocr_alloc(allocr, input_ids);
-
-        if (!ggml_allocr_is_measure(allocr)) {
-            ggml_backend_tensor_set(input_ids, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids));
-        }
-
-        struct ggml_tensor* input_ids2 = NULL;
-        size_t max_token_idx           = 0;
-        if (version == VERSION_XL) {
-            input_ids2 = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, tokens.size());
-            ggml_allocr_alloc(allocr, input_ids2);
-
-            auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
-            if (it != tokens.end()) {
-                std::fill(std::next(it), tokens.end(), 0);
-            }
-
-            max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
-
-            // for (int i = 0; i < tokens.size(); i++) {
-            //     printf("%d ", tokens[i]);
-            // }
-            // printf("\n");
-
-            if (!ggml_allocr_is_measure(allocr)) {
-                ggml_backend_tensor_set(input_ids2, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids2));
-            }
+        input_ids2 = to_backend(input_ids2);
+        if (!return_pooled) {
+            input_ids = to_backend(input_ids);
         }
 
         struct ggml_tensor* embeddings = NULL;
 
         if (num_custom_embeddings > 0 && version != VERSION_XL) {
-            embeddings = ggml_new_tensor_2d(compute_ctx,
-                                            wtype,
-                                            text_model.hidden_size,
-                                            text_model.vocab_size + num_custom_embeddings /* custom placeholder */);
-            ggml_allocr_alloc(allocr, embeddings);
-            if (!ggml_allocr_is_measure(allocr)) {
-                // really bad, there is memory inflexibility (this is for host<->device memory conflicts)
-                auto token_embed_weight = text_model.get_token_embed_weight();
-                void* freeze_data       = malloc(ggml_nbytes(token_embed_weight));
-                ggml_backend_tensor_get_and_sync(backend,
-                                                 token_embed_weight,
-                                                 freeze_data,
-                                                 0,
-                                                 ggml_nbytes(token_embed_weight));
-                ggml_backend_tensor_set(embeddings, freeze_data, 0, ggml_nbytes(token_embed_weight));
-                free(freeze_data);
-                // concatenate custom embeddings
-                ggml_backend_tensor_set(embeddings,
-                                        (const void*)token_embed_custom.data(),
-                                        ggml_nbytes(token_embed_weight),
-                                        num_custom_embeddings * text_model.hidden_size * ggml_type_size(wtype));
-            }
+            auto custom_embeddings = ggml_new_tensor_3d(compute_ctx,
+                                                        wtype,
+                                                        text_model.hidden_size,
+                                                        1,
+                                                        num_custom_embeddings);
+            set_backend_tensor_data(custom_embeddings, token_embed_custom.data());
+
+            auto token_embed_weight = text_model.get_token_embed_weight();
+            token_embed_weight      = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
+            // concatenate custom embeddings
+            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
+            embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
         }
 
         struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
@@ -1028,12 +996,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
     }
 
     void compute(const int n_threads,
-                 std::vector<int> tokens,
+                 struct ggml_tensor* input_ids,
+                 struct ggml_tensor* input_ids2,
+                 size_t max_token_idx,
                  bool return_pooled,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(compute_allocr, tokens, return_pooled);
+            return build_graph(input_ids, input_ids2, max_token_idx, return_pooled);
         };
         GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
     }
@@ -1147,8 +1117,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLModule {
         vision_model.get_param_tensors(tensors, prefix + "transformer.visual_model");
     }
 
-    struct ggml_cgraph* build_graph(struct ggml_allocr* allocr,
-                                    struct ggml_tensor* pixel_values) {
+    struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         pixel_values = to_backend(pixel_values);
@@ -1160,19 +1129,12 @@ struct FrozenCLIPVisionEmbedder : public GGMLModule {
         return gf;
     }
 
-    void alloc_compute_buffer(ggml_context* work_ctx, ggml_tensor* pixel_values) {
-        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(compute_allocr, pixel_values);
-        };
-        GGMLModule::alloc_compute_buffer(get_graph);
-    }
-
     void compute(const int n_threads,
                  ggml_tensor* pixel_values,
                  ggml_tensor** output,
                  ggml_context* output_ctx) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(compute_allocr, pixel_values);
+            return build_graph(pixel_values);
         };
         GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
     }
 
@@ -166,7 +166,6 @@ class ControlNetBlock : public GGMLBlock {
 
     struct ggml_tensor* resblock_forward(std::string name,
                                          struct ggml_context* ctx,
-                                         struct ggml_allocr* allocr,
                                          struct ggml_tensor* x,
                                          struct ggml_tensor* emb) {
         auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@@ -175,7 +174,6 @@ class ControlNetBlock : public GGMLBlock {
 
     struct ggml_tensor* attention_layer_forward(std::string name,
                                                 struct ggml_context* ctx,
-                                                struct ggml_allocr* allocr,
                                                 struct ggml_tensor* x,
                                                 struct ggml_tensor* context) {
         auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
@@ -201,11 +199,10 @@ class ControlNetBlock : public GGMLBlock {
     }
 
     std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                             struct ggml_allocr* allocr,
                                              struct ggml_tensor* x,
                                              struct ggml_tensor* hint,
                                              struct ggml_tensor* guided_hint,
-                                             std::vector<float> timesteps,
+                                             struct ggml_tensor* timesteps,
                                              struct ggml_tensor* context,
                                              struct ggml_tensor* y = NULL) {
         // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
@@ -231,7 +228,7 @@ class ControlNetBlock : public GGMLBlock {
 
         auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
 
-        auto t_emb = new_timestep_embedding(ctx, allocr, timesteps, model_channels);  // [N, model_channels]
+        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
 
         auto emb = time_embed_0->forward(ctx, t_emb);
         emb      = ggml_silu_inplace(ctx, emb);
@@ -272,10 +269,10 @@ class ControlNetBlock : public GGMLBlock {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_block_idx += 1;
                 std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                h                = resblock_forward(name, ctx, allocr, h, emb);  // [N, mult*model_channels, h, w]
+                h                = resblock_forward(name, ctx, h, emb);  // [N, mult*model_channels, h, w]
                 if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                     std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    h                = attention_layer_forward(name, ctx, allocr, h, context);  // [N, mult*model_channels, h, w]
+                    h                = attention_layer_forward(name, ctx, h, context);  // [N, mult*model_channels, h, w]
                 }
 
                 auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@@ -299,9 +296,9 @@ class ControlNetBlock : public GGMLBlock {
         // [N, 4*model_channels, h/8, w/8]
 
         // middle_block
-        h = resblock_forward("middle_block.0", ctx, allocr, h, emb);             // [N, 4*model_channels, h/8, w/8]
-        h = attention_layer_forward("middle_block.1", ctx, allocr, h, context);  // [N, 4*model_channels, h/8, w/8]
-        h = resblock_forward("middle_block.2", ctx, allocr, h, emb);             // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.0", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]
+        h = attention_layer_forward("middle_block.1", ctx, h, context);  // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.2", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]
 
         // out
         outs.push_back(middle_block_out->forward(ctx, h));
@@ -386,18 +383,22 @@ struct ControlNet : public GGMLModule {
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                     struct ggml_tensor* hint,
-                                    std::vector<float> timesteps,
+                                    struct ggml_tensor* timesteps,
                                     struct ggml_tensor* context,
                                     struct ggml_tensor* y = NULL) {
         struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
 
-        x       = to_backend(x);
-        hint    = to_backend(hint);
-        context = to_backend(context);
-        y       = to_backend(y);
+        x = to_backend(x);
+        if (guided_hint_cached) {
+            hint = NULL;
+        } else {
+            hint = to_backend(hint);
+        }
+        context   = to_backend(context);
+        y         = to_backend(y);
+        timesteps = to_backend(timesteps);
 
         auto outs = control_net.forward(compute_ctx,
-                                        compute_allocr,
                                         x,
                                         hint,
                                         guided_hint_cached ? guided_hint : NULL,
@@ -420,7 +421,7 @@ struct ControlNet : public GGMLModule {
     void compute(int n_threads,
                  struct ggml_tensor* x,
                  struct ggml_tensor* hint,
-                 std::vector<float> timesteps,
+                 struct ggml_tensor* timesteps,
                  struct ggml_tensor* context,
                  struct ggml_tensor* y,
                  struct ggml_tensor** output     = NULL,
@@ -434,7 +435,6 @@ struct ControlNet : public GGMLModule {
         };
 
         GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
-
         guided_hint_cached = true;
     }
 
 
@@ -96,6 +96,7 @@ struct SDParams {
     bool vae_tiling               = false;
     bool control_net_cpu          = false;
     bool canny_preprocess         = false;
+    int upscale_repeats           = 1;
 };
 
 void print_params(SDParams params) {
@@ -129,6 +130,7 @@ void print_params(SDParams params) {
     printf("    seed:              %ld\n", params.seed);
     printf("    batch_count:       %d\n", params.batch_count);
     printf("    vae_tiling:        %s\n", params.vae_tiling ? "true" : "false");
+    printf("    upscale_repeats:   %d\n", params.upscale_repeats);
 }
 
 void print_usage(int argc, const char* argv[]) {
@@ -145,6 +147,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --control-net [CONTROL_PATH]       path to control net model\n");
     printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings.\n");
     printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n");
+    printf("  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)\n");
     printf("  --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
     printf("                                     If not specified, the default is the type of the weight file.\n");
     printf("  --lora-model-dir [DIR]             lora model directory\n");
@@ -296,6 +299,16 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.prompt = argv[i];
+        } else if (arg == "--upscale-repeats") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.upscale_repeats = std::stoi(argv[i]);
+            if (params.upscale_repeats < 1) {
+                fprintf(stderr, "error: upscale multiplier must be at least 1\n");
+                exit(1);
+            }
         } else if (arg == "-n" || arg == "--negative-prompt") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -486,6 +499,18 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     }
 }
 
+// Returns the last component of `path`. Both '/' and '\\' are treated as
+// separators and whichever occurs last wins, so "C:/models\\sd.ckpt" gives
+// "sd.ckpt". A path with no separator is returned unchanged; a path ending
+// in a separator gives an empty string.
+static std::string sd_basename(const std::string& path) {
+    const size_t pos = path.find_last_of("/\\");
+    if (pos == std::string::npos) {
+        return path;
+    }
+    return path.substr(pos + 1);
+}
+
 std::string get_image_params(SDParams params, int64_t seed) {
     std::string parameter_string = params.prompt + "\n";
     if (params.negative_prompt.size() != 0) {
@@ -618,7 +643,14 @@ int main(int argc, const char* argv[]) {
                                            input_image_buffer};
             if (params.canny_preprocess) {  // apply preprocessor
                 LOG_INFO("Applying canny preprocessor");
-                control_image->data = preprocess_canny(control_image->data, control_image->width, control_image->height);
+                control_image->data = preprocess_canny(control_image->data,
+                                                       control_image->width,
+                                                       control_image->height,
+                                                       0.08f,
+                                                       0.08f,
+                                                       0.8f,
+                                                       1.0f,
+                                                       false);
             }
         }
         results = txt2img(sd_ctx,
@@ -700,7 +732,7 @@ int main(int argc, const char* argv[]) {
     }
 
     int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
-    if (params.esrgan_path.size() > 0) {
+    if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
         upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
                                                         params.n_threads,
                                                         params.wtype);
@@ -712,13 +744,17 @@ int main(int argc, const char* argv[]) {
                 if (results[i].data == NULL) {
                     continue;
                 }
-                sd_image_t upscaled_image = upscale(upscaler_ctx, results[i], upscale_factor);
-                if (upscaled_image.data == NULL) {
-                    printf("upscale failed\n");
-                    continue;
+                sd_image_t current_image = results[i];
+                for (int u = 0; u < params.upscale_repeats; ++u) {
+                    sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor);
+                    if (upscaled_image.data == NULL) {
+                        printf("upscale failed\n");
+                        break;
+                    }
+                    free(current_image.data);
+                    current_image = upscaled_image;
                 }
-                free(results[i].data);
-                results[i] = upscaled_image;
+                results[i] = current_image;  // Set the final upscaled image as the result
             }
         }
     }