Commit 6bb87cf

reuse get_learned_condition
1 parent 745ed8f commit 6bb87cf

2 files changed (+18, -135 lines)


clip.hpp (+2, -2)
@@ -172,9 +172,9 @@ class CLIPTokenizer {
 
         auto it = encoder.find(utf8_to_utf32("img</w>"));
         if (it != encoder.end()) {
-            LOG_DEBUG(" trigger word img already in vocab \n");
+            LOG_DEBUG(" trigger word img already in vocab");
         } else {
-            LOG_DEBUG(" trigger word img not in vocab yet\n");
+            LOG_DEBUG(" trigger word img not in vocab yet");
         }
 
         int rank = 0;
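
Note: the only change here drops the trailing "\n" from the two debug messages, on the assumption that LOG_DEBUG already terminates each entry with a newline. A hypothetical macro of that shape (not the project's actual definition):

    #include <cstdio>

    // Hypothetical logger that appends the newline itself; passing a message that
    // already ends in "\n" would print an extra blank line after each entry.
    #define LOG_DEBUG_SKETCH(msg) std::fprintf(stderr, "[DEBUG] %s\n", (msg))

    int main() {
        LOG_DEBUG_SKETCH(" trigger word img already in vocab");  // exactly one line of output
        return 0;
    }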

stable-diffusion.cpp (+16, -133)
@@ -533,13 +533,11 @@ class StableDiffusionGGML {
                                        int height,
                                        int num_input_imgs,
                                        bool force_zero_embeddings = false) {
-        cond_stage_model->set_clip_skip(clip_skip);
         auto image_tokens = cond_stage_model->convert_token_to_id(trigger_word);
         // if(image_tokens.size() == 1){
         //     printf(" image token id is: %d \n", image_tokens[0]);
         // }
         GGML_ASSERT(image_tokens.size() == 1);
-        // auto tokens_and_weights = cond_stage_model.tokenize(text, true);
         auto tokens_and_weights = cond_stage_model->tokenize_with_trigger_token(text,
                                                                                 num_input_imgs,
                                                                                 image_tokens[0],
@@ -555,142 +553,14 @@ class StableDiffusionGGML {
         // for(int i = 0; i < clsm.size(); ++i)
         //     printf("%d ", clsm[i]?1:0);
         // printf("\n");
-        int64_t t0 = ggml_time_ms();
-        struct ggml_tensor* hidden_states = NULL;  // [N, n_token, hidden_size]
-        struct ggml_tensor* pooled = NULL;
-        // size_t total_hidden_size = cond_stage_model.text_model.hidden_size;
-        // if (version == VERSION_XL) {
-        //     total_hidden_size += cond_stage_model.text_model2.hidden_size;
-        //     pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim);
-        // }
-        // struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx,
-        //                                                        GGML_TYPE_F32,
-        //                                                        total_hidden_size,
-        //                                                        cond_stage_model.text_model.max_position_embeddings);  // [N, n_token, hidden_size]
-        // cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size());
-        // cond_stage_model.compute(n_threads, tokens, hidden_states, pooled);
-        // cond_stage_model.free_compute_buffer();
-
-        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
-        struct ggml_tensor* input_ids2 = NULL;
-        size_t max_token_idx = 0;
-        if (version == VERSION_XL) {
-            auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
-            if (it != tokens.end()) {
-                std::fill(std::next(it), tokens.end(), 0);
-            }
-
-            max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
-
-            input_ids2 = vector_to_ggml_tensor_i32(work_ctx, tokens);
-
-            // for (int i = 0; i < tokens.size(); i++) {
-            //     printf("%d ", tokens[i]);
-            // }
-            // printf("\n");
-        }
-
-        cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, false, &hidden_states, work_ctx);
-        if (version == VERSION_XL) {
-            cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, true, &pooled, work_ctx);
-        }
-
-        // cond_stage_model->compute(n_threads, tokens, false, &hidden_states, work_ctx);
-        // if (version == VERSION_XL) {
-        //     cond_stage_model->compute(n_threads, tokens, true, &pooled, work_ctx);
-        // }
-        // if (pooled != NULL) {
-        //     print_ggml_tensor(hidden_states);
-        //     print_ggml_tensor(pooled);
-        // }
-
-        int64_t t1 = ggml_time_ms();
-        LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-        ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states);
-        {
-            float original_mean = ggml_tensor_mean(hidden_states);
-            for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) {
-                for (int i1 = 0; i1 < hidden_states->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < hidden_states->ne[0]; i0++) {
-                        float value = ggml_tensor_get_f32(hidden_states, i0, i1, i2);
-                        value *= weights[i1];
-                        ggml_tensor_set_f32(result, value, i0, i1, i2);
-                    }
-                }
-            }
-            float new_mean = ggml_tensor_mean(result);
-            ggml_tensor_scale(result, (original_mean / new_mean));
-        }
-        if (force_zero_embeddings) {
-            float* vec = (float*)result->data;
-            for (int i = 0; i < ggml_nelements(result); i++) {
-                vec[i] = 0;
-            }
-        }
-
-        ggml_tensor* vec = NULL;
-        if (version == VERSION_XL) {
-            int out_dim = 256;
-            // vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels);
-            vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->unet.adm_in_channels);
-            // [0:1280]
-            size_t offset = 0;
-            memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
-            offset += ggml_nbytes(pooled);
-
-            // struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2);
-            // original_size_as_tuple
-            float orig_width = (float)width;
-            float orig_height = (float)height;
-            std::vector<float> timesteps = {orig_height, orig_width};
-            // ggml_tensor_set_f32(timesteps, orig_height, 0);
-            // ggml_tensor_set_f32(timesteps, orig_width, 1);
-            ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            // crop_coords_top_left
-            float crop_coord_top = 0.f;
-            float crop_coord_left = 0.f;
-            timesteps = {crop_coord_top, crop_coord_left};
-            // ggml_tensor_set_f32(timesteps, crop_coord_top, 0);
-            // ggml_tensor_set_f32(timesteps, crop_coord_left, 1);
-            embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            // target_size_as_tuple
-            float target_width = (float)width;
-            float target_height = (float)height;
-            // ggml_tensor_set_f32(timesteps, target_height, 0);
-            // ggml_tensor_set_f32(timesteps, target_width, 1);
-            timesteps = {target_height, target_width};
-            embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            GGML_ASSERT(offset == ggml_nbytes(vec));
-        }
-        return std::make_tuple(result, vec, clsm);
+        auto cond = get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
+        return std::make_tuple(cond.first, cond.second, clsm);
     }
 
     ggml_tensor* id_encoder(ggml_context* work_ctx,
                             ggml_tensor* init_img,
                             ggml_tensor* prompts_embeds,
                             std::vector<bool>& class_tokens_mask) {
-        // size_t total_hidden_size = cond_stage_model.text_model.hidden_size;
-        // if (version == VERSION_XL) {
-        //     total_hidden_size += cond_stage_model.text_model2.hidden_size;
-        // }
-        // ggml_tensor *res = ggml_new_tensor_2d(work_ctx,
-        //                                       prompts_embeds->type,
-        //                                       total_hidden_size,
-        //                                       cond_stage_model.text_model.max_position_embeddings);
-
-        // pmid_model.alloc_compute_buffer(work_ctx, init_img, prompts_embeds, class_tokens_mask);
-        // pmid_model.compute(n_threads, init_img, prompts_embeds, class_tokens_mask, res);
-        // pmid_model.free_compute_buffer();
-
         ggml_tensor* res = NULL;
         pmid_model->compute(n_threads, init_img, prompts_embeds, class_tokens_mask, &res, work_ctx);
 
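
Note: the removed block is not lost behavior; running the text encoder, applying per-token prompt weights with mean renormalization, and packing the SDXL pooled/ADM vector all move into get_learned_condition_common, introduced in the next hunk. For reference, the weighting step works like this standalone sketch on plain float buffers (hypothetical helper, not part of the codebase):

    #include <cstddef>
    #include <vector>

    // Sketch of the per-token weighting applied to the hidden states:
    // scale each token's hidden vector by its weight, then rescale the whole
    // buffer so its mean matches the unweighted mean (as the moved loop does).
    void apply_prompt_weights(std::vector<float>& hidden,         // n_token * hidden_size, row-major
                              const std::vector<float>& weights,  // one weight per token
                              size_t hidden_size) {
        double original_sum = 0.0;
        for (float v : hidden) {
            original_sum += v;
        }

        const size_t n_token = weights.size();
        for (size_t t = 0; t < n_token; t++) {
            for (size_t i = 0; i < hidden_size; i++) {
                hidden[t * hidden_size + i] *= weights[t];
            }
        }

        double new_sum = 0.0;
        for (float v : hidden) {
            new_sum += v;
        }
        // Preserve the original mean (sketch-only guard against division by zero).
        if (new_sum != 0.0) {
            const float scale = static_cast<float>(original_sum / new_sum);
            for (float& v : hidden) {
                v *= scale;
            }
        }
    }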

@@ -703,10 +573,20 @@ class StableDiffusionGGML {
                                                            int width,
                                                            int height,
                                                            bool force_zero_embeddings = false) {
-        cond_stage_model->set_clip_skip(clip_skip);
         auto tokens_and_weights = cond_stage_model->tokenize(text, true);
         std::vector<int>& tokens = tokens_and_weights.first;
         std::vector<float>& weights = tokens_and_weights.second;
+        return get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
+    }
+
+    std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition_common(ggml_context* work_ctx,
+                                                                       std::vector<int>& tokens,
+                                                                       std::vector<float>& weights,
+                                                                       int clip_skip,
+                                                                       int width,
+                                                                       int height,
+                                                                       bool force_zero_embeddings = false) {
+        cond_stage_model->set_clip_skip(clip_skip);
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;        // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size]
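
Note: after this hunk, get_learned_condition and get_learned_condition_with_trigger differ only in how they tokenize; everything else funnels through the shared get_learned_condition_common. A stripped-down sketch of that delegation shape, with simplified stand-in types rather than the real classes:

    #include <string>
    #include <vector>

    // Sketch of the refactor: two public entry points that tokenize differently,
    // both forwarding to one shared helper. Names and types are hypothetical.
    struct Cond {
        std::vector<float> hidden_states;
        std::vector<float> pooled;  // empty when the model has no pooled/ADM vector
    };

    static Cond learn_condition_common(const std::vector<int>& tokens,
                                       const std::vector<float>& weights) {
        Cond c;
        c.hidden_states.assign(tokens.size(), 0.0f);  // placeholder for the real text-encoder pass
        (void)weights;
        return c;
    }

    Cond learn_condition(const std::string& text) {
        std::vector<int> tokens(text.size(), 0);  // stand-in for tokenize(text, true)
        std::vector<float> weights(tokens.size(), 1.0f);
        return learn_condition_common(tokens, weights);
    }

    Cond learn_condition_with_trigger(const std::string& text, int num_input_imgs) {
        std::vector<int> tokens(text.size() + num_input_imgs, 0);  // stand-in for tokenize_with_trigger_token(...)
        std::vector<float> weights(tokens.size(), 1.0f);
        return learn_condition_common(tokens, weights);  // trigger variant adds its class-token mask on top
    }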
@@ -1691,6 +1571,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+    if (sd_ctx->sd->stacked_id) {
+        params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+    }
     params.mem_size += width * height * 3 * sizeof(float);
     params.mem_size *= batch_count;
     params.mem_buffer = NULL;
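
Note: the extra 10 MB is reserved only when stacked_id is set (presumably the stacked-ID / pmid path touched elsewhere in this diff). A worked example of the resulting work-context budget, assuming a 512x512 output and batch_count = 1; the constants mirror the lines above:

    #include <cstdio>

    int main() {
        // Mirrors the sizing logic in txt2img for a 512x512, batch_count = 1 run.
        const int width = 512, height = 512, batch_count = 1;
        const bool stacked_id = true;  // assumption for this example

        size_t mem_size = static_cast<size_t>(10 * 1024 * 1024);    // 10 MB base
        if (stacked_id) {
            mem_size += static_cast<size_t>(10 * 1024 * 1024);      // +10 MB for the stacked-ID path
        }
        mem_size += width * height * 3 * sizeof(float);             // 3,145,728 bytes for the RGB float image
        mem_size *= batch_count;

        std::printf("work ctx mem_size: %zu bytes (%.1f MB)\n",
                    mem_size, mem_size / (1024.0 * 1024.0));        // 24,117,248 bytes = 23.0 MB
        return 0;
    }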
