WIP - Web server + conv2d fused + k-quants + dynamic gpu offloading #221

Draft · wants to merge 6 commits into master
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,3 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggerganov/ggml.git
path = ggml
url = https://github.com/FSSRepo/ggml.git
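Because the `ggml` submodule now points at the FSSRepo fork instead of ggerganov/ggml, an existing checkout has to re-sync the submodule URL before it can fetch this branch's ggml commit. A minimal sketch of the usual git steps (assuming the submodule was already initialized from the old URL):

```
# propagate the new URL from .gitmodules into .git/config,
# then fetch and check out the commit this branch expects
git submodule sync ggml
git submodule update --init --recursive ggml
```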
30 changes: 18 additions & 12 deletions CMakeLists.txt
@@ -24,18 +24,18 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
option(SD_CUBLAS "sd: cuda backend" OFF)
option(SD_CUDA "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, non-deterministic (the same seed may not generate the same image), CUDA only" OFF)
option(SD_CONV2D_MEMORY_EFFICIENT "sd: conv2d memory efficient (vae stage less memory usage)" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)

if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
if(SD_CUDA)
message("Use CUDA as backend stable-diffusion")
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUDA)
endif()

if(SD_METAL)
@@ -47,15 +47,21 @@ endif()
if (SD_HIPBLAS)
message("Use HIPBLAS as backend stable-diffusion")
set(GGML_HIPBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
add_definitions(-DSD_USE_CUDA)
endif ()

if(SD_FLASH_ATTN)
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
if(SD_HIPBLAS)
message("Flash Attention is not supported by the HIPBLAS backend")
else()
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
endif()
endif()

if(SD_CONV2D_MEMORY_EFFICIENT)
message("Use a fused conv2d kernel for memory optimization")
set(GGML_CONV2D_FUSED ON)
endif()

set(SD_LIB stable-diffusion)
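With `SD_CUBLAS` renamed to `SD_CUDA` and the new `SD_CONV2D_MEMORY_EFFICIENT` switch, a CUDA build that also enables flash attention and the fused conv2d kernel would be configured roughly as follows (a sketch using only the options defined above and a standard out-of-tree build directory):

```
mkdir -p build && cd build
cmake .. -DSD_CUDA=ON -DSD_FLASH_ATTN=ON -DSD_CONV2D_MEMORY_EFFICIENT=ON
cmake --build . --config Release
```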
4 changes: 2 additions & 2 deletions README.md
@@ -111,12 +111,12 @@ cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```

##### Using CUBLAS
##### Using CUDA

This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.

```
cmake .. -DSD_CUBLAS=ON
cmake .. -DSD_CUDA=ON
cmake --build . --config Release
```

4 changes: 2 additions & 2 deletions clip.hpp
@@ -172,9 +172,9 @@ class CLIPTokenizer {

auto it = encoder.find(utf8_to_utf32("img</w>"));
if (it != encoder.end()) {
LOG_DEBUG(" trigger word img already in vocab");
LOG_DEBUG("trigger word img already in vocab");
} else {
LOG_DEBUG(" trigger word img not in vocab yet");
LOG_DEBUG("trigger word img not in vocab yet");
}

int rank = 0;
49 changes: 44 additions & 5 deletions common.hpp
@@ -279,24 +279,63 @@ class CrossAttention : public GGMLBlock {
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;

#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
bool apply_flash = n_context % 256 == 0 && d_head == 40;
#endif

auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, n); // [N, n_token, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]

#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(apply_flash) {
q = ggml_pad(ctx, q, d_head == 40 ? 8 : 0, 0, 0, 0);
}
#endif

auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
k = ggml_reshape_4d(ctx, k, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_context, d_head]
k = ggml_reshape_3d(ctx, k, d_head, n_context, n_head * n); // [N * n_head, n_context, d_head]

#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(apply_flash) {
k = ggml_cast(ctx, ggml_pad(ctx, k, d_head == 40 ? 8 : 0, 0, 0, 0), GGML_TYPE_F16);
}
#endif

auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
v = ggml_reshape_4d(ctx, v, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]

auto kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(apply_flash) {
v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
v = ggml_reshape_3d(ctx, v, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]
v = ggml_cast(ctx, ggml_pad(ctx, v, d_head == 40 ? 8 : 0, 0, 0, 0), GGML_TYPE_F16);
} else {
#endif
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
}
#endif

struct ggml_tensor* kqv = nullptr;
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
if(!apply_flash) {
#endif
kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
#if defined(SD_USE_FLASH_ATTENTION) && defined(SD_USE_CUDA)
} else {
kqv = ggml_flash_attn_ext(ctx, q, k, v, nullptr, 1.f / sqrtf(d_head));
ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32);
kqv = ggml_view_3d(ctx, kqv, d_head, n_head, n_token, kqv->nb[1], kqv->nb[2], 0);
kqv = ggml_cont(ctx, kqv);
}
#endif

x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, n); // [N, n_token, inner_dim]

3 changes: 2 additions & 1 deletion examples/CMakeLists.txt
@@ -1,3 +1,4 @@
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

add_subdirectory(cli)
add_subdirectory(cli)
add_subdirectory(server)
6 changes: 4 additions & 2 deletions examples/cli/main.cpp
@@ -721,7 +721,6 @@ int main(int argc, const char* argv[]) {
params.embeddings_path.c_str(),
params.stacked_id_embeddings_path.c_str(),
vae_decode_only,
params.vae_tiling,
true,
params.n_threads,
params.wtype,
@@ -747,6 +746,7 @@ int main(int argc, const char* argv[]) {
control_image = new sd_image_t{(uint32_t)params.width,
(uint32_t)params.height,
3,
-1,
control_image_buffer};
if (params.canny_preprocess) { // apply preprocessor
control_image->data = preprocess_canny(control_image->data,
@@ -777,11 +777,13 @@
params.control_strength,
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str());
params.input_id_images_path.c_str(),
params.vae_tiling);
} else {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
3,
-1,
input_image_buffer};

if (params.mode == IMG2VID) {
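`vae_tiling` is no longer fixed when the context is created; it is now passed per `txt2img`/`img2img` call, and the `sd_image_t` initializers gain the extra field shown above. From the command line the behaviour is unchanged; a sketch of a tiled-VAE run, assuming the existing `sd` CLI binary and its `--vae-tiling` option that populates `params.vae_tiling`:

```
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lighthouse at dawn" -H 768 -W 768 --vae-tiling
```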
9 changes: 9 additions & 0 deletions examples/server/CMakeLists.txt
@@ -0,0 +1,9 @@
set(TARGET server)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
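With `SD_BUILD_EXAMPLES` enabled, the new target builds like the existing `cli` example (on Windows it additionally links `ws2_32` for the socket API, as declared above). A sketch of the extra step on top of the configure line used earlier:

```
# from the configured build directory
cmake --build . --config Release --target server
```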
20 changes: 20 additions & 0 deletions examples/server/deps.sh
@@ -0,0 +1,20 @@
#!/bin/bash
# Download and update deps for binary

# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public

echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/preact.js
echo >> $PUBLIC/preact.js # add newline

FILES=$(ls $PUBLIC)

cd $PUBLIC
for FILE in $FILES; do
echo "generate $FILE.hpp"

# use simple flag for old version of xxd
xxd -i $FILE > $DIR/$FILE.hpp
done
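The script is a development helper rather than part of the build: it re-downloads the bundled preact.js and regenerates the `*.hpp` headers (such as `index.html.hpp` below) that embed everything under `public/` into the server binary. A sketch of its use, assuming `curl` and `xxd` are available:

```
cd examples/server
bash deps.sh
```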
32 changes: 32 additions & 0 deletions examples/server/index.html.hpp
@@ -0,0 +1,32 @@
unsigned char index_html[] = {
0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a, 0x3c, 0x68, 0x65, 0x61, 0x64,
0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20,
0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x22, 0x55, 0x54, 0x46,
0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x65,
0x74, 0x61, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x76, 0x69, 0x65,
0x77, 0x70, 0x6f, 0x72, 0x74, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
0x6e, 0x74, 0x3d, 0x22, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3d, 0x64, 0x65,
0x76, 0x69, 0x63, 0x65, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x2c, 0x20,
0x69, 0x6e, 0x69, 0x74, 0x69, 0x61, 0x6c, 0x2d, 0x73, 0x63, 0x61, 0x6c,
0x65, 0x3d, 0x31, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d,
0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31, 0x22, 0x20, 0x2f, 0x3e,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e,
0x53, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x20, 0x44, 0x69, 0x66, 0x66, 0x75,
0x73, 0x69, 0x6f, 0x6e, 0x20, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3c,
0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x6c, 0x69, 0x6e, 0x6b, 0x20, 0x72, 0x65, 0x6c, 0x3d, 0x22, 0x73,
0x74, 0x79, 0x6c, 0x65, 0x73, 0x68, 0x65, 0x65, 0x74, 0x22, 0x20, 0x68,
0x72, 0x65, 0x66, 0x3d, 0x22, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x73, 0x2e,
0x63, 0x73, 0x73, 0x22, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x3c, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65,
0x3d, 0x22, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, 0x20, 0x73, 0x72,
0x63, 0x3d, 0x22, 0x6d, 0x61, 0x69, 0x6e, 0x2e, 0x6a, 0x73, 0x22, 0x3e,
0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f,
0x68, 0x65, 0x61, 0x64, 0x3e, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e,
0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64,
0x3d, 0x22, 0x61, 0x70, 0x70, 0x2d, 0x76, 0x69, 0x65, 0x77, 0x70, 0x6f,
0x72, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64,
0x69, 0x76, 0x3e, 0x0a, 0x3c, 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a,
0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3e
};
unsigned int index_html_len = 343;