@@ -533,13 +533,11 @@ class StableDiffusionGGML {
                                        int height,
                                        int num_input_imgs,
                                        bool force_zero_embeddings = false) {
-        cond_stage_model->set_clip_skip(clip_skip);
         auto image_tokens = cond_stage_model->convert_token_to_id(trigger_word);
         // if(image_tokens.size() == 1){
         //     printf(" image token id is: %d \n", image_tokens[0]);
         // }
         GGML_ASSERT(image_tokens.size() == 1);
-        // auto tokens_and_weights = cond_stage_model.tokenize(text, true);
         auto tokens_and_weights = cond_stage_model->tokenize_with_trigger_token(text,
                                                                                 num_input_imgs,
                                                                                 image_tokens[0],
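Note: besides token ids and weights, the call above is also expected to yield the class-token mask (`clsm`) that marks which positions the trigger word expands to for the input ID images; that mask is what `id_encoder` later consumes. The exact return type is not visible in this hunk, so the alias below is only a hypothetical sketch of its shape:

    #include <tuple>
    #include <vector>

    // Hypothetical alias (not the real signature): the three pieces the calling
    // code consumes from tokenize_with_trigger_token.
    using TriggerTokenizeResult = std::tuple<std::vector<int>,    // token ids
                                             std::vector<float>,  // per-token attention weights
                                             std::vector<bool>>;  // class-token mask (clsm)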
@@ -555,142 +553,14 @@ class StableDiffusionGGML {
         // for(int i = 0; i < clsm.size(); ++i)
         //     printf("%d ", clsm[i]?1:0);
         // printf("\n");
-        int64_t t0 = ggml_time_ms();
-        struct ggml_tensor* hidden_states = NULL;  // [N, n_token, hidden_size]
-        struct ggml_tensor* pooled = NULL;
-        // size_t total_hidden_size = cond_stage_model.text_model.hidden_size;
-        // if (version == VERSION_XL) {
-        //     total_hidden_size += cond_stage_model.text_model2.hidden_size;
-        //     pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim);
-        // }
-        // struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx,
-        //                                                        GGML_TYPE_F32,
-        //                                                        total_hidden_size,
-        //                                                        cond_stage_model.text_model.max_position_embeddings);  // [N, n_token, hidden_size]
-        // cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size());
-        // cond_stage_model.compute(n_threads, tokens, hidden_states, pooled);
-        // cond_stage_model.free_compute_buffer();
-
-        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
-        struct ggml_tensor* input_ids2 = NULL;
-        size_t max_token_idx = 0;
-        if (version == VERSION_XL) {
-            auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
-            if (it != tokens.end()) {
-                std::fill(std::next(it), tokens.end(), 0);
-            }
-
-            max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
-
-            input_ids2 = vector_to_ggml_tensor_i32(work_ctx, tokens);
-
-            // for (int i = 0; i < tokens.size(); i++) {
-            //     printf("%d ", tokens[i]);
-            // }
-            // printf("\n");
-        }
-
-        cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, false, &hidden_states, work_ctx);
-        if (version == VERSION_XL) {
-            cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, true, &pooled, work_ctx);
-        }
-
-        // cond_stage_model->compute(n_threads, tokens, false, &hidden_states, work_ctx);
-        // if (version == VERSION_XL) {
-        //     cond_stage_model->compute(n_threads, tokens, true, &pooled, work_ctx);
-        // }
-        // if (pooled != NULL) {
-        //     print_ggml_tensor(hidden_states);
-        //     print_ggml_tensor(pooled);
-        // }
-
-        int64_t t1 = ggml_time_ms();
-        LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
-        ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states);
-        {
-            float original_mean = ggml_tensor_mean(hidden_states);
-            for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) {
-                for (int i1 = 0; i1 < hidden_states->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < hidden_states->ne[0]; i0++) {
-                        float value = ggml_tensor_get_f32(hidden_states, i0, i1, i2);
-                        value *= weights[i1];
-                        ggml_tensor_set_f32(result, value, i0, i1, i2);
-                    }
-                }
-            }
-            float new_mean = ggml_tensor_mean(result);
-            ggml_tensor_scale(result, (original_mean / new_mean));
-        }
-        if (force_zero_embeddings) {
-            float* vec = (float*)result->data;
-            for (int i = 0; i < ggml_nelements(result); i++) {
-                vec[i] = 0;
-            }
-        }
-
-        ggml_tensor* vec = NULL;
-        if (version == VERSION_XL) {
-            int out_dim = 256;
-            // vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels);
-            vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->unet.adm_in_channels);
-            // [0:1280]
-            size_t offset = 0;
-            memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
-            offset += ggml_nbytes(pooled);
-
-            // struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2);
-            // original_size_as_tuple
-            float orig_width = (float)width;
-            float orig_height = (float)height;
-            std::vector<float> timesteps = {orig_height, orig_width};
-            // ggml_tensor_set_f32(timesteps, orig_height, 0);
-            // ggml_tensor_set_f32(timesteps, orig_width, 1);
-            ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            // crop_coords_top_left
-            float crop_coord_top = 0.f;
-            float crop_coord_left = 0.f;
-            timesteps = {crop_coord_top, crop_coord_left};
-            // ggml_tensor_set_f32(timesteps, crop_coord_top, 0);
-            // ggml_tensor_set_f32(timesteps, crop_coord_left, 1);
-            embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            // target_size_as_tuple
-            float target_width = (float)width;
-            float target_height = (float)height;
-            // ggml_tensor_set_f32(timesteps, target_height, 0);
-            // ggml_tensor_set_f32(timesteps, target_width, 1);
-            timesteps = {target_height, target_width};
-            embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
-            offset += ggml_nbytes(embed_view);
-            set_timestep_embedding(timesteps, embed_view, out_dim);
-            // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
-            GGML_ASSERT(offset == ggml_nbytes(vec));
-        }
-        return std::make_tuple(result, vec, clsm);
+        auto cond = get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
+        return std::make_tuple(cond.first, cond.second, clsm);
     }

     ggml_tensor* id_encoder(ggml_context* work_ctx,
                             ggml_tensor* init_img,
                             ggml_tensor* prompts_embeds,
                             std::vector<bool>& class_tokens_mask) {
-        // size_t total_hidden_size = cond_stage_model.text_model.hidden_size;
-        // if (version == VERSION_XL) {
-        //     total_hidden_size += cond_stage_model.text_model2.hidden_size;
-        // }
-        // ggml_tensor *res = ggml_new_tensor_2d(work_ctx,
-        //                                       prompts_embeds->type,
-        //                                       total_hidden_size,
-        //                                       cond_stage_model.text_model.max_position_embeddings);
-
-        // pmid_model.alloc_compute_buffer(work_ctx, init_img, prompts_embeds, class_tokens_mask);
-        // pmid_model.compute(n_threads, init_img, prompts_embeds, class_tokens_mask, res);
-        // pmid_model.free_compute_buffer();
-
         ggml_tensor* res = NULL;
         pmid_model->compute(n_threads, init_img, prompts_embeds, class_tokens_mask, &res, work_ctx);
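The block removed above (now folded into get_learned_condition_common) built the SDXL extra conditioning vector `vec` by copying the pooled text embedding to the front and appending three 2×out_dim timestep embeddings for original_size_as_tuple, crop_coords_top_left and target_size_as_tuple. A minimal offset-arithmetic sketch of that layout, assuming a 1280-float pooled embedding and adm_in_channels = 2816 (the usual SDXL base sizes), counted in floats rather than the bytes ggml_nbytes() tracks:

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t pooled_dim      = 1280;  // assumed pooled text-embedding width
        const size_t out_dim         = 256;   // per-scalar timestep-embedding width (from the diff)
        const size_t adm_in_channels = 2816;  // assumed UNet adm input width for SDXL base

        size_t offset = 0;
        offset += pooled_dim;   // [0, 1280): pooled embedding copied via memcpy
        offset += 2 * out_dim;  // original_size_as_tuple -> (height, width)
        offset += 2 * out_dim;  // crop_coords_top_left   -> (top, left)
        offset += 2 * out_dim;  // target_size_as_tuple   -> (height, width)

        assert(offset == adm_in_channels);  // mirrors GGML_ASSERT(offset == ggml_nbytes(vec))
        return 0;
    }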
@@ -703,10 +573,20 @@ class StableDiffusionGGML {
                                             int width,
                                             int height,
                                             bool force_zero_embeddings = false) {
-        cond_stage_model->set_clip_skip(clip_skip);
         auto tokens_and_weights = cond_stage_model->tokenize(text, true);
         std::vector<int>& tokens = tokens_and_weights.first;
         std::vector<float>& weights = tokens_and_weights.second;
+        return get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
+    }
+
+    std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition_common(ggml_context* work_ctx,
+                                                                       std::vector<int>& tokens,
+                                                                       std::vector<float>& weights,
+                                                                       int clip_skip,
+                                                                       int width,
+                                                                       int height,
+                                                                       bool force_zero_embeddings = false) {
+        cond_stage_model->set_clip_skip(clip_skip);
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;        // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size]
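Both get_learned_condition and get_learned_condition_with_trigger now delegate to this helper, which keeps the prompt-weighting step they previously duplicated: each token's hidden state is scaled by its weight, then the whole tensor is rescaled so its mean matches the unweighted mean. A standalone sketch of that step on plain vectors (the real code does the same walk over a ggml tensor):

    #include <cstddef>
    #include <vector>

    static float mean_of(const std::vector<float>& v) {
        float sum = 0.f;
        for (float x : v) sum += x;
        return sum / (float)v.size();
    }

    // hidden is [n_token * hidden_size] row-major, weights is [n_token].
    std::vector<float> apply_token_weights(const std::vector<float>& hidden,
                                           const std::vector<float>& weights,
                                           size_t hidden_size) {
        std::vector<float> result(hidden.size());
        const float original_mean = mean_of(hidden);
        for (size_t t = 0; t < weights.size(); t++) {
            for (size_t i = 0; i < hidden_size; i++) {
                result[t * hidden_size + i] = hidden[t * hidden_size + i] * weights[t];
            }
        }
        const float new_mean = mean_of(result);
        for (float& x : result) x *= original_mean / new_mean;  // restore the original mean
        return result;
    }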
@@ -1691,6 +1571,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,

    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+   if (sd_ctx->sd->stacked_id) {
+       params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+   }
    params.mem_size += width * height * 3 * sizeof(float);
    params.mem_size *= batch_count;
    params.mem_buffer = NULL;
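The extra 10 MB above is only reserved when the stacked-ID (PhotoMaker-style) path is active, on top of the base budget and the per-image float buffer. A small sketch of the resulting work-buffer arithmetic, using a hypothetical helper that mirrors the lines in the diff:

    #include <cstddef>
    #include <cstdio>

    // Hypothetical helper, not part of the library API: reproduces the
    // mem_size arithmetic from the diff.
    size_t txt2img_mem_size(int width, int height, int batch_count, bool stacked_id) {
        size_t mem_size = static_cast<size_t>(10 * 1024 * 1024);  // base 10 MB
        if (stacked_id) {
            mem_size += static_cast<size_t>(10 * 1024 * 1024);    // +10 MB for the stacked-ID path
        }
        mem_size += (size_t)width * height * 3 * sizeof(float);   // RGB float image
        mem_size *= (size_t)batch_count;
        return mem_size;
    }

    int main() {
        // 1024x1024, batch 1, stacked ID on: 20 MiB + 12 MiB = 32 MiB
        printf("%zu bytes\n", txt2img_mem_size(1024, 1024, 1, true));
        return 0;
    }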