WIP - Web server + conv2d fused + k-quants + dynamic gpu offloading #221

Draft · wants to merge 6 commits into master
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,3 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggerganov/ggml.git
path = ggml
url = https://github.com/FSSRepo/ggml.git
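Because the `ggml` submodule now points at the FSSRepo fork instead of ggerganov/ggml, an existing checkout has to re-sync the submodule URL before it can fetch this branch's ggml commit. A minimal sketch of the usual git steps (assuming the submodule was already initialized from the old URL):

```
# propagate the new URL from .gitmodules into .git/config,
# then fetch and check out the commit this branch expects
git submodule sync ggml
git submodule update --init --recursive ggml
```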
30 changes: 18 additions & 12 deletions CMakeLists.txt
@@ -24,18 +24,18 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
option(SD_CUBLAS "sd: cuda backend" OFF)
option(SD_CUDA "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, non-deterministic (the same seed may not generate the same image), CUDA only" OFF)
option(SD_CONV2D_MEMORY_EFFICIENT "sd: conv2d memory efficient (vae stage less memory usage)" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)

if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
if(SD_CUDA)
message("Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
endif()

if(SD_METAL)
@@ -47,15 +47,21 @@ endif()
if (SD_HIPBLAS)
message("Use HIPBLAS as backend stable-diffusion")
set(GGML_HIPBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
add_definitions(-DSD_USE_CUDA)
endif ()

if(SD_FLASH_ATTN)
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
if(SD_HIPBLAS)
message("Flash Attention is not supported by the HIPBLAS backend")
else()
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
endif()
endif()

if(SD_CONV2D_MEMORY_EFFICIENT)
message("Use a fused conv2d kernel for memory optimization")
set(GGML_CONV2D_FUSED ON)
endif()

set(SD_LIB stable-diffusion)
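With `SD_CUBLAS` renamed to `SD_CUDA` and the new `SD_CONV2D_MEMORY_EFFICIENT` switch, a CUDA build that also enables flash attention and the fused conv2d kernel would be configured roughly as follows (a sketch using only the options defined above and a standard out-of-tree build directory):

```
mkdir -p build && cd build
cmake .. -DSD_CUDA=ON -DSD_FLASH_ATTN=ON -DSD_CONV2D_MEMORY_EFFICIENT=ON
cmake --build . --config Release
```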
4 changes: 2 additions & 2 deletions README.md
@@ -111,12 +111,12 @@ cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```

##### Using CUBLAS
##### Using CUDA

This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.

```
cmake .. -DSD_CUBLAS=ON
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```

4 changes: 2 additions & 2 deletions clip.hpp
@@ -172,9 +172,9 @@ class CLIPTokenizer {

auto it = encoder.find(utf8_to_utf32("img</w>"));
if (it != encoder.end()) {
LOG_DEBUG(" trigger word img already in vocab");
LOG_DEBUG("trigger word img already in vocab");
} else {
LOG_DEBUG(" trigger word img not in vocab yet");
LOG_DEBUG("trigger word img not in vocab yet");
}

int rank = 0;
49 changes: 44 additions & 5 deletions common.hpp
@@ -279,24 +279,63 @@ class CrossAttention : public GGMLBlock {
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;

#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
bool apply_flash = n_context % 256 == 0 && d_head == 40;
#endif

auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, n); // [N, n_token, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]

#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(apply_flash) {
q = ggml_pad(ctx, q, d_head == 40 ? 8 : 0, 0, 0, 0);
}
#endif

auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
k = ggml_reshape_4d(ctx, k, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_context, d_head]
k = ggml_reshape_3d(ctx, k, d_head, n_context, n_head * n); // [N * n_head, n_context, d_head]

#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(apply_flash) {
k = ggml_cast(ctx, ggml_pad(ctx, k, d_head == 40 ? 8 : 0, 0, 0, 0), GGML_TYPE_F16);
}
#endif

auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
v = ggml_reshape_4d(ctx, v, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]

auto kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(apply_flash) {
v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
v = ggml_reshape_3d(ctx, v, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]
v = ggml_cast(ctx, ggml_pad(ctx, v, d_head == 40 ? 8 : 0, 0, 0, 0), GGML_TYPE_F16);
} else {
#endif
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
}
#endif

struct ggml_tensor* kqv = nullptr;
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(!apply_flash) {
#endif
kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
} else {
kqv = ggml_flash_attn_ext(ctx, q, k, v, nullptr, 1.f / sqrtf(d_head));
ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32);
kqv = ggml_view_3d(ctx, kqv, d_head, n_head, n_token, kqv->nb[1], kqv->nb[2], 0);
kqv = ggml_cont(ctx, kqv);
}
#endif

x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, n); // [N, n_token, inner_dim]

3 changes: 2 additions & 1 deletion examples/CMakeLists.txt
@@ -1,3 +1,4 @@
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

add_subdirectory(cli)
add_subdirectory(cli)
add_subdirectory(server)
6 changes: 4 additions & 2 deletions examples/cli/main.cpp
@@ -721,7 +721,6 @@ int main(int argc, const char* argv[]) {
params.embeddings_path.c_str(),
params.stacked_id_embeddings_path.c_str(),
vae_decode_only,
params.vae_tiling,
true,
params.n_threads,
params.wtype,
@@ -747,6 +746,7 @@ int main(int argc, const char* argv[]) {
control_image = new sd_image_t{(uint32_t)params.width,
(uint32_t)params.height,
3,
-1,
control_image_buffer};
if (params.canny_preprocess) { // apply preprocessor
control_image->data = preprocess_canny(control_image->data,
@@ -777,11 +777,13 @@
params.control_strength,
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str());
params.input_id_images_path.c_str(),
params.vae_tiling);
} else {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
3,
-1,
input_image_buffer};

if (params.mode == IMG2VID) {
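`vae_tiling` is no longer fixed when the context is created; it is now passed per `txt2img`/`img2img` call, and the `sd_image_t` initializers gain the extra field shown above. From the command line the behaviour is unchanged; a sketch of a tiled-VAE run, assuming the existing `sd` CLI binary and its `--vae-tiling` option that populates `params.vae_tiling`:

```
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lighthouse at dawn" -H 768 -W 768 --vae-tiling
```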
9 changes: 9 additions & 0 deletions examples/server/CMakeLists.txt
@@ -0,0 +1,9 @@
set(TARGET server)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
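With `SD_BUILD_EXAMPLES` enabled, the new target builds like the existing `cli` example (on Windows it additionally links `ws2_32` for the socket API, as declared above). A sketch of the extra step on top of the configure line used earlier:

```
# from the configured build directory
cmake --build . --config Release --target server
```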
20 changes: 20 additions & 0 deletions examples/server/deps.sh
@@ -0,0 +1,20 @@
#!/bin/bash
# Download and update deps for binary

# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public

echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/preact.js
echo >> $PUBLIC/preact.js # add newline

FILES=$(ls $PUBLIC)

cd $PUBLIC
for FILE in $FILES; do
echo "generate $FILE.hpp"

# use simple flag for old version of xxd
xxd -i $FILE > $DIR/$FILE.hpp
done
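The script is a development helper rather than part of the build: it re-downloads the bundled preact.js and regenerates the `*.hpp` headers (such as `index.html.hpp` below) that embed everything under `public/` into the server binary. A sketch of its use, assuming `curl` and `xxd` are available:

```
cd examples/server
bash deps.sh
```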
32 changes: 32 additions & 0 deletions examples/server/index.html.hpp
@@ -0,0 +1,32 @@
unsigned char index_html[] = {
0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a, 0x3c, 0x68, 0x65, 0x61, 0x64,
0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20,
0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x22, 0x55, 0x54, 0x46,
0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x65,
0x74, 0x61, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x76, 0x69, 0x65,
0x77, 0x70, 0x6f, 0x72, 0x74, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
0x6e, 0x74, 0x3d, 0x22, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3d, 0x64, 0x65,
0x76, 0x69, 0x63, 0x65, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x2c, 0x20,
0x69, 0x6e, 0x69, 0x74, 0x69, 0x61, 0x6c, 0x2d, 0x73, 0x63, 0x61, 0x6c,
0x65, 0x3d, 0x31, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d,
0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31, 0x22, 0x20, 0x2f, 0x3e,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e,
0x53, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x44, 0x69, 0x66, 0x66, 0x75,
0x73, 0x69, 0x6f, 0x6e, 0x20, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3c,
0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x6c, 0x69, 0x6e, 0x6b, 0x20, 0x72, 0x65, 0x6c, 0x3d, 0x22, 0x73,
0x74, 0x79, 0x6c, 0x65, 0x73, 0x68, 0x65, 0x65, 0x74, 0x22, 0x20, 0x68,
0x72, 0x65, 0x66, 0x3d, 0x22, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x73, 0x2e,
0x63, 0x73, 0x73, 0x22, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65,
0x3d, 0x22, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x20, 0x73, 0x72,
0x63, 0x3d, 0x22, 0x6d, 0x61, 0x69, 0x6e, 0x2e, 0x6a, 0x73, 0x22, 0x3e,
0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f,
0x68, 0x65, 0x61, 0x64, 0x3e, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64,
0x3d, 0x22, 0x61, 0x70, 0x70, 0x2d, 0x76, 0x69, 0x65, 0x77, 0x70, 0x6f,
0x72, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64,
0x69, 0x76, 0x3e, 0x0a, 0x3c, 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a,
0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3e
};
unsigned int index_html_len = 343;