add --clip-on-cpu

leejet · leejet · commit 6727d1cc8fee · 2024-03-10T16:35:03.000+08:00
diff --git a/clip.hpp b/clip.hpp
@@ -567,7 +567,7 @@ struct CLIPEncoder : public GGMLBlock {
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {
         // x: [N, n_token, d_model]
         int layer_idx = n_layer - 1;
-        LOG_DEBUG("clip_skip %d", clip_skip);
+        // LOG_DEBUG("clip_skip %d", clip_skip);
         if (clip_skip > 0) {
             layer_idx = n_layer - clip_skip;
         }
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -100,6 +100,7 @@ struct SDParams {
     bool vae_tiling               = false;
     bool control_net_cpu          = false;
     bool normalize_input          = false;
+    bool clip_on_cpu              = false;
     bool vae_on_cpu               = false;
     bool canny_preprocess         = false;
     int upscale_repeats           = 1;
@@ -123,6 +124,7 @@ void print_params(SDParams params) {
     printf("    output_path:       %s\n", params.output_path.c_str());
     printf("    init_img:          %s\n", params.input_path.c_str());
     printf("    control_image:     %s\n", params.control_image_path.c_str());
+    printf("    clip on cpu:       %s\n", params.clip_on_cpu ? "true" : "false");
     printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
     printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf("    strength(control): %.2f\n", params.control_strength);
@@ -396,6 +398,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
             params.control_net_cpu = true;
         } else if (arg == "--normalize-input") {
             params.normalize_input = true;
+        } else if (arg == "--clip-on-cpu") {
+            params.clip_on_cpu = true;  // will slow down get_learned_condition but necessary for low MEM GPUs
         } else if (arg == "--vae-on-cpu") {
             params.vae_on_cpu = true;  // will slow down latent decoding but necessary for low MEM GPUs
         } else if (arg == "--canny") {
@@ -658,6 +662,7 @@ int main(int argc, const char* argv[]) {
                                   params.wtype,
                                   params.rng_type,
                                   params.schedule,
+                                  params.clip_on_cpu,
                                   params.control_net_cpu,
                                   params.vae_on_cpu);
 
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
@@ -829,7 +829,10 @@ struct GGMLModule {
 
         // compute the required memory
         size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB", get_desc().c_str(), compute_buffer_size / 1024.0 / 1024.0);
+        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
+                  get_desc().c_str(),
+                  compute_buffer_size / 1024.0 / 1024.0,
+                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
         return true;
     }
 
@@ -874,8 +877,11 @@ struct GGMLModule {
             return false;
         }
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
-        LOG_DEBUG("%s params backend buffer size = % 6.2f MB (%i tensors)",
-                  get_desc().c_str(), params_buffer_size / (1024.0 * 1024.0), num_tensors);
+        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
+                  get_desc().c_str(),
+                  params_buffer_size / (1024.0 * 1024.0),
+                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                  num_tensors);
         return true;
     }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -65,8 +65,11 @@ void calculate_alphas_cumprod(float* alphas_cumprod,
 
 class StableDiffusionGGML {
 public:
-    ggml_backend_t backend    = NULL;  // general backend
-    ggml_type model_data_type = GGML_TYPE_COUNT;
+    ggml_backend_t backend             = NULL;  // general backend
+    ggml_backend_t clip_backend        = NULL;
+    ggml_backend_t control_net_backend = NULL;
+    ggml_backend_t vae_backend         = NULL;
+    ggml_type model_data_type          = GGML_TYPE_COUNT;
 
     SDVersion version;
     bool vae_decode_only         = false;
@@ -120,6 +123,9 @@ class StableDiffusionGGML {
 
     ~StableDiffusionGGML() {
-        ggml_backend_free(backend);
+        if (clip_backend != backend) ggml_backend_free(clip_backend);  // clip_backend is `backend` itself unless CLIP was moved to the CPU
+        if (control_net_backend != backend) ggml_backend_free(control_net_backend);  // free a sub-backend only when it is a distinct handle
+        if (vae_backend != backend) ggml_backend_free(vae_backend);
+        ggml_backend_free(backend);  // general backend goes last so a shared handle is released exactly once
     }
 
     bool load_from_file(const std::string& model_path,
@@ -131,6 +137,7 @@ class StableDiffusionGGML {
                         bool vae_tiling_,
                         ggml_type wtype,
                         schedule_t schedule,
+                        bool clip_on_cpu,
                         bool control_net_cpu,
                         bool vae_on_cpu) {
         use_tiny_autoencoder = taesd_path.size() > 0;
@@ -212,7 +219,12 @@ class StableDiffusionGGML {
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
-            cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(backend, model_data_type, version);
+            clip_backend = backend;
+            if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
+                LOG_INFO("CLIP: Using CPU backend");
+                clip_backend = ggml_backend_cpu_init();
+            }
+            cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_data_type, version);
             cond_stage_model->alloc_params_buffer();
             cond_stage_model->get_param_tensors(tensors, "cond_stage_model.");
 
@@ -228,7 +240,6 @@ class StableDiffusionGGML {
             }
 
             if (!use_tiny_autoencoder) {
-                ggml_backend_t vae_backend = NULL;
                 if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
                     LOG_INFO("VAE Autoencoder: Using CPU backend");
                     vae_backend = ggml_backend_cpu_init();
@@ -244,19 +255,19 @@ class StableDiffusionGGML {
             // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
 
             if (control_net_path.size() > 0) {
-                ggml_backend_t cn_backend = NULL;
+                ggml_backend_t controlnet_backend = NULL;
                 if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
                     LOG_DEBUG("ControlNet: Using CPU backend");
-                    cn_backend = ggml_backend_cpu_init();
+                    controlnet_backend = ggml_backend_cpu_init();
                 } else {
-                    cn_backend = backend;
+                    controlnet_backend = backend;
                 }
-                control_net = std::make_shared<ControlNet>(cn_backend, model_data_type, version);
+                control_net = std::make_shared<ControlNet>(controlnet_backend, model_data_type, version);
             }
 
-            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, GGML_TYPE_F32, version);
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(clip_backend, model_data_type, version);
             if (id_embeddings_path.size() > 0) {
-                pmid_lora = std::make_shared<LoraModel>(backend, GGML_TYPE_F32, id_embeddings_path, "");
+                pmid_lora = std::make_shared<LoraModel>(backend, model_data_type, id_embeddings_path, "");
                 if (!pmid_lora->load_from_file(true)) {
                     LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str());
                     return false;
@@ -359,15 +370,49 @@ class StableDiffusionGGML {
                 pmid_params_mem_size = pmid_model->get_params_mem_size();
             }
 
-            size_t total_params_size = clip_params_mem_size + clip_params_mem_size +
-                                       clip_params_mem_size + control_net_params_mem_size + pmid_params_mem_size;
-            LOG_INFO("total params memory size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB, controlnet %.2fMB, pmid %.2fMB)",
-                     total_params_size / 1024.0 / 1024.0,
-                     clip_params_mem_size / 1024.0 / 1024.0,
-                     unet_params_mem_size / 1024.0 / 1024.0,
-                     vae_params_mem_size / 1024.0 / 1024.0,
-                     control_net_params_mem_size / 1024.0 / 1024.0,
-                     pmid_params_mem_size / 1024.0 / 1024.0);
+            size_t total_params_ram_size  = 0;
+            size_t total_params_vram_size = 0;
+            if (ggml_backend_is_cpu(clip_backend)) {
+                total_params_ram_size += clip_params_mem_size + pmid_params_mem_size;
+            } else {
+                total_params_vram_size += clip_params_mem_size + pmid_params_mem_size;
+            }
+
+            if (ggml_backend_is_cpu(backend)) {
+                total_params_ram_size += unet_params_mem_size;
+            } else {
+                total_params_vram_size += unet_params_mem_size;
+            }
+
+            if (ggml_backend_is_cpu(vae_backend)) {
+                total_params_ram_size += vae_params_mem_size;
+            } else {
+                total_params_vram_size += vae_params_mem_size;
+            }
+
+            if (ggml_backend_is_cpu(control_net_backend)) {
+                total_params_ram_size += control_net_params_mem_size;
+            } else {
+                total_params_vram_size += control_net_params_mem_size;
+            }
+
+            size_t total_params_size = total_params_ram_size + total_params_vram_size;
+            LOG_INFO(
+                "total params memory size = %.2fMB (VRAM %.2fMB, RAM %.2fMB): "
+                "clip %.2fMB(%s), unet %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), pmid %.2fMB(%s)",
+                total_params_size / 1024.0 / 1024.0,
+                total_params_vram_size / 1024.0 / 1024.0,
+                total_params_ram_size / 1024.0 / 1024.0,
+                clip_params_mem_size / 1024.0 / 1024.0,
+                ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM",
+                unet_params_mem_size / 1024.0 / 1024.0,
+                ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                vae_params_mem_size / 1024.0 / 1024.0,
+                ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM",
+                control_net_params_mem_size / 1024.0 / 1024.0,
+                ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM",
+                pmid_params_mem_size / 1024.0 / 1024.0,
+                ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
         }
 
         int64_t t1 = ggml_time_ms();
@@ -1435,6 +1480,7 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                      enum sd_type_t wtype,
                      enum rng_type_t rng_type,
                      enum schedule_t s,
+                     bool keep_clip_on_cpu,
                      bool keep_control_net_cpu,
                      bool keep_vae_on_cpu) {
     sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
@@ -1467,6 +1513,7 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                                     vae_tiling,
                                     (ggml_type)wtype,
                                     s,
+                                    keep_clip_on_cpu,
                                     keep_control_net_cpu,
                                     keep_vae_on_cpu)) {
         delete sd_ctx->sd;
@@ -1601,11 +1648,11 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
             int32_t w                              = input_id_images[0]->width;
             int32_t h                              = input_id_images[0]->height;
             int32_t channels                       = input_id_images[0]->channel;
-            int32_t num_input_images               = input_id_images.size();
+            int32_t num_input_images               = (int32_t)input_id_images.size();
             init_img                               = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, w, h, channels, num_input_images);
             // TODO: move these to somewhere else and be user settable
-            float mean[] = {0.48145466, 0.4578275, 0.40821073};
-            float std[]  = {0.26862954, 0.26130258, 0.27577711};
+            float mean[] = {0.48145466f, 0.4578275f, 0.40821073f};
+            float std[]  = {0.26862954f, 0.26130258f, 0.27577711f};
             for (int i = 0; i < num_input_images; i++) {
                 sd_image_t* init_image = input_id_images[i];
                 if (normalize_input)
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -125,6 +125,7 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
                             enum sd_type_t wtype,
                             enum rng_type_t rng_type,
                             enum schedule_t s,
+                            bool keep_clip_on_cpu,
                             bool keep_control_net_cpu,
                             bool keep_vae_on_cpu);
 

Original file line number	Diff line number	Diff line change
`@@ -567,7 +567,7 @@ struct CLIPEncoder : public GGMLBlock {`
`567`	`567`	`struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {`
`568`	`568`	`// x: [N, n_token, d_model]`
`569`	`569`	`int layer_idx = n_layer - 1;`
`570`		`- LOG_DEBUG("clip_skip %d", clip_skip);`
	`570`	`+ // LOG_DEBUG("clip_skip %d", clip_skip);`
`571`	`571`	`if (clip_skip > 0) {`
`572`	`572`	`layer_idx = n_layer - clip_skip;`
`573`	`573`	`}`