
Commit be6cd1a

sync: update ggml
1 parent e1384de commit be6cd1a

9 files changed: +33, -32 lines

CMakeLists.txt (+1, -1)

@@ -34,7 +34,7 @@ option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
 
 if(SD_CUBLAS)
     message("Use CUBLAS as backend stable-diffusion")
-    set(GGML_CUBLAS ON)
+    set(GGML_CUDA ON)
     add_definitions(-DSD_USE_CUBLAS)
 endif()

clip.hpp (+4, -4)

@@ -679,8 +679,8 @@ class CLIPVisionEmbeddings : public GGMLBlock {
         class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding);  // [N, embed_dim]
         class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]
 
-        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding);  // [N, num_positions, embed_dim, 1]
-        x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);  // [N, num_positions, embed_dim]
+        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
+        x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);  // [N, num_positions, embed_dim]
         x = ggml_add(ctx, x, position_embed_weight);
         return x;  // [N, num_positions, embed_dim]
     }

@@ -1036,7 +1036,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
                                               hidden_states2->ne[3]);
             hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));
 
-            hidden_states = ggml_concat(ctx, hidden_states, hidden_states2);  // [N, n_token, hidden_size + hidden_size2]
+            hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2);  // [N, n_token, hidden_size + hidden_size2]
 
             hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
         }

@@ -1069,7 +1069,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
             auto token_embed_weight = text_model.get_token_embed_weight();
             token_embed_weight = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
             // concatenate custom embeddings
-            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
+            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
             embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
         }
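
Note: the clip.hpp hunks above, and the matching ones in esrgan.hpp, pmid.hpp and unet.hpp below, all track the same upstream change: ggml_concat now takes an explicit concat dimension instead of always joining along dim 2. A minimal sketch of the new call shape, with placeholder tensor names that are not from this commit:

    // Sketch of the updated ggml_concat signature (placeholder names, not from this repo).
    #include "ggml.h"

    static struct ggml_tensor* concat_channels(struct ggml_context* ctx,
                                               struct ggml_tensor* a,   // [W, H, C0, N]
                                               struct ggml_tensor* b) { // [W, H, C1, N]
        // Old API: ggml_concat(ctx, a, b) always concatenated along dim 2.
        // New API: the dimension is explicit; passing 2 reproduces the old behaviour.
        return ggml_concat(ctx, a, b, 2);                               // [W, H, C0 + C1, N]
    }

Every call site touched by this commit passes 2, so the graphs it builds are unchanged.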

esrgan.hpp (+4, -4)

@@ -42,13 +42,13 @@ class ResidualDenseBlock : public GGMLBlock {
         auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
 
         auto x1 = lrelu(ctx, conv1->forward(ctx, x));
-        auto x_cat = ggml_concat(ctx, x, x1);
+        auto x_cat = ggml_concat(ctx, x, x1, 2);
         auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
-        x_cat = ggml_concat(ctx, x_cat, x2);
+        x_cat = ggml_concat(ctx, x_cat, x2, 2);
         auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
-        x_cat = ggml_concat(ctx, x_cat, x3);
+        x_cat = ggml_concat(ctx, x_cat, x3, 2);
         auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
-        x_cat = ggml_concat(ctx, x_cat, x4);
+        x_cat = ggml_concat(ctx, x_cat, x4, 2);
         auto x5 = conv5->forward(ctx, x_cat);
 
         x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);

ggml

Submodule ggml updated from 57869ad to 2aae01f

ggml_extend.hpp (+1, -1)

@@ -917,7 +917,7 @@ struct GGMLModule {
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
+        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
            // pass input tensors to gpu memory
            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
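
Note: the change above stops relying on the old per-tensor backend enum and instead asks whether the tensor's buffer is still host-visible before copying it to the compute backend. A hedged sketch of the same test as a standalone helper (the helper name is made up for illustration):

    // Hypothetical helper: does this input tensor still live in host memory and
    // therefore need copying over to the non-CPU compute backend?
    #include "ggml-backend.h"

    static bool needs_upload(ggml_backend_t backend, const struct ggml_tensor* tensor) {
        // No buffer yet, or a host (CPU-visible) buffer, means the data has not
        // been placed on the GPU/accelerator backend.
        return !ggml_backend_is_cpu(backend) &&
               (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer));
    }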

model.cpp (+2, -4)

@@ -571,10 +571,9 @@ void convert_tensor(void* src,
         if (dst_type == GGML_TYPE_F16) {
             ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
         } else {
-            int64_t hist[16];
             std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
             const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, hist, im);
+            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
         }
     } else if (dst_type == GGML_TYPE_F32) {
         if (src_type == GGML_TYPE_F16) {

@@ -602,10 +601,9 @@ void convert_tensor(void* src,
             if (dst_type == GGML_TYPE_F16) {
                 ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
             } else {
-                int64_t hist[16];
                 std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
                 const float* im = imatrix.data();
-                ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, hist, im);
+                ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
             }
         }
     }
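
Note: ggml_quantize_chunk lost its int64_t hist[16] output parameter in the updated ggml; the final argument is now the importance matrix alone. A hedged sketch of a call with the new signature (helper and parameter names are placeholders):

    // Placeholder helper: quantize nrows rows of F32 data into dst with the new
    // ggml_quantize_chunk signature (no histogram argument).
    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    static size_t quantize_rows(enum ggml_type dst_type, const float* src, void* dst,
                                int64_t nrows, int64_t n_per_row) {
        std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix, as in model.cpp
        // start at row 0; the return value is the size of the quantized data in bytes
        return ggml_quantize_chunk(dst_type, src, dst, 0, nrows, n_per_row, imatrix.data());
    }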

pmid.hpp (+6, -6)

@@ -64,7 +64,7 @@ struct FuseModule : public GGMLBlock {
         auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
         auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
         // concat is along dim 2
-        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0);
+        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
         stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
 
         // stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);

@@ -102,12 +102,12 @@ struct FuseModule : public GGMLBlock {
 
         stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
         if (left && right) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
         } else if (left) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
         } else if (right) {
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
         }
         stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
         class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));

@@ -146,7 +146,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
         id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
         id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
 
-        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
+        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
         id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
 
         struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,

stable-diffusion.h (+12, -9)

@@ -60,12 +60,11 @@ enum sd_type_t {
     SD_TYPE_Q4_0 = 2,
     SD_TYPE_Q4_1 = 3,
     // SD_TYPE_Q4_2 = 4, support has been removed
-    // SD_TYPE_Q4_3 (5) support has been removed
-    SD_TYPE_Q5_0 = 6,
-    SD_TYPE_Q5_1 = 7,
-    SD_TYPE_Q8_0 = 8,
-    SD_TYPE_Q8_1 = 9,
-    // k-quantizations
+    // SD_TYPE_Q4_3 = 5, support has been removed
+    SD_TYPE_Q5_0 = 6,
+    SD_TYPE_Q5_1 = 7,
+    SD_TYPE_Q8_0 = 8,
+    SD_TYPE_Q8_1 = 9,
     SD_TYPE_Q2_K = 10,
     SD_TYPE_Q3_K = 11,
     SD_TYPE_Q4_K = 12,

@@ -80,9 +79,13 @@ enum sd_type_t {
     SD_TYPE_IQ3_S = 21,
     SD_TYPE_IQ2_S = 22,
     SD_TYPE_IQ4_XS = 23,
-    SD_TYPE_I8,
-    SD_TYPE_I16,
-    SD_TYPE_I32,
+    SD_TYPE_I8 = 24,
+    SD_TYPE_I16 = 25,
+    SD_TYPE_I32 = 26,
+    SD_TYPE_I64 = 27,
+    SD_TYPE_F64 = 28,
+    SD_TYPE_IQ1_M = 29,
+    SD_TYPE_BF16 = 30,
     SD_TYPE_COUNT,
 };
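
Note: sd_type_t mirrors ggml's ggml_type enum value for value; the explicit = 24 ... = 30 assignments and the new SD_TYPE_I64 / SD_TYPE_F64 / SD_TYPE_IQ1_M / SD_TYPE_BF16 entries keep the two enums aligned after the ggml update. A sketch of the direct cast this one-to-one mapping permits (assumption: the enums stay in lockstep; the helper name is illustrative):

    // Illustrative helper: relies on sd_type_t staying numerically identical to
    // ggml_type, which the explicit enum values above are there to guarantee.
    #include "ggml.h"
    #include "stable-diffusion.h"

    static enum ggml_type sd_type_to_ggml_type(enum sd_type_t type) {
        return (enum ggml_type)type;  // e.g. SD_TYPE_BF16 (30) -> GGML_TYPE_BF16 (30)
    }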

unet.hpp (+2, -2)

@@ -396,7 +396,7 @@ class UnetModelBlock : public GGMLBlock {
             if (c_concat->ne[3] != x->ne[3]) {
                 c_concat = ggml_repeat(ctx, c_concat, x);
             }
-            x = ggml_concat(ctx, x, c_concat);
+            x = ggml_concat(ctx, x, c_concat, 2);
         }
 
         if (y != NULL) {

@@ -491,7 +491,7 @@ class UnetModelBlock : public GGMLBlock {
                 control_offset--;
             }
 
-            h = ggml_concat(ctx, h, h_skip);
+            h = ggml_concat(ctx, h, h_skip, 2);
 
             std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
