@@ -46,7 +46,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     SDVersion version = VERSION_SD1;
     PMVersion pm_version = PM_VERSION_1;
     CLIPTokenizer tokenizer;
-    ggml_type wtype;
     std::shared_ptr<CLIPTextModelRunner> text_model;
     std::shared_ptr<CLIPTextModelRunner> text_model2;
@@ -57,25 +56,25 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<std::string> readed_embeddings;

     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
-                                      ggml_type wtype,
+                                      std::map<std::string, enum ggml_type>& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1,
                                       int clip_skip = -1)
-        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
+        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (version == VERSION_SD2 || version == VERSION_SDXL) {
                 clip_skip = 2;
             }
         }
         if (version == VERSION_SD1) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (version == VERSION_SD2) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_H_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (version == VERSION_SDXL) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }
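
The change threading through every constructor in this diff: instead of one global ggml_type wtype, each runner receives a map from full tensor name to ggml type, plus the name prefix it owns. A minimal sketch of the lookup this enables (the helper name and the F32 fallback are assumptions for illustration, not part of this diff):

    // Hypothetical helper, not from this diff: resolve a tensor's weight type
    // from the tensor_types map, falling back to F32 when the checkpoint did
    // not record a type for that name. Assumes ggml.h for enum ggml_type.
    static enum ggml_type lookup_type(const std::map<std::string, enum ggml_type>& tensor_types,
                                      const std::string& name,
                                      enum ggml_type fallback = GGML_TYPE_F32) {
        auto it = tensor_types.find(name);
        return it != tensor_types.end() ? it->second : fallback;
    }

    // e.g. a runner built with prefix "cond_stage_model.transformer.text_model"
    // might ask for lookup_type(tensor_types, prefix + ".embeddings.token_embedding.weight");

This lets mixed-precision checkpoints keep each tensor in its stored type instead of forcing a single type onto the whole text encoder.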
@@ -138,14 +137,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
                 return false;
             }
-            embd = ggml_new_tensor_2d(embd_ctx, wtype, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+            embd = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
             *dst_tensor = embd;
             return true;
         };
         model_loader.load_tensors(on_load, NULL);
         readed_embeddings.push_back(embd_name);
         token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
-        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(wtype)),
+        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
                embd->data,
                ggml_nbytes(embd));
         for (int i = 0; i < embd->ne[1]; i++) {
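
Two details in this hunk: the custom-embedding tensor is now created with the storage type read from the file (tensor_storage.type), and the packing offset uses that same per-tensor type. The offset arithmetic assumes the packed embeddings share a scalar type; a worked sketch of what it computes (restated here for clarity, not added by the diff):

    // Byte offset of the next free slot in the packed custom-embedding buffer.
    // For scalar types such as F16, ggml_type_size() is the per-element size
    // (2 bytes), so each embedding row strides hidden_size * 2 bytes. For
    // quantized types it would return the per-block size instead.
    size_t offset = (size_t)num_custom_embeddings * hidden_size * ggml_type_size(embd->type);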
@@ -590,9 +589,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     CLIPVisionModelProjection vision_model;

-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, ggml_type wtype)
-        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, wtype) {
-        vision_model.init(params_ctx, wtype);
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
+        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
     }

     std::string get_desc() {
@@ -627,7 +626,6 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 };

 struct SD3CLIPEmbedder : public Conditioner {
-    ggml_type wtype;
     CLIPTokenizer clip_l_tokenizer;
     CLIPTokenizer clip_g_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
@@ -636,15 +634,15 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;

     SD3CLIPEmbedder(ggml_backend_t backend,
-                    ggml_type wtype,
+                    std::map<std::string, enum ggml_type>& tensor_types,
                     int clip_skip = -1)
-        : wtype(wtype), clip_g_tokenizer(0) {
+        : clip_g_tokenizer(0) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        t5     = std::make_shared<T5Runner>(backend, wtype);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }

     void set_clip_skip(int clip_skip) {
@@ -974,21 +972,19 @@ struct SD3CLIPEmbedder : public Conditioner {
 };

 struct FluxCLIPEmbedder : public Conditioner {
-    ggml_type wtype;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;

     FluxCLIPEmbedder(ggml_backend_t backend,
-                     ggml_type wtype,
-                     int clip_skip = -1)
-        : wtype(wtype) {
+                     std::map<std::string, enum ggml_type>& tensor_types,
+                     int clip_skip = -1) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, true);
-        t5     = std::make_shared<T5Runner>(backend, wtype);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }

     void set_clip_skip(int clip_skip) {
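
For context, a caller-side sketch of how the map might be populated and handed to an embedder. The loader step and the concrete tensor names and types below are illustrative assumptions, not taken from this diff; in practice the map would be filled while enumerating the checkpoint's tensors:

    // Illustrative usage sketch (names and types assumed for the example):
    std::map<std::string, enum ggml_type> tensor_types;
    tensor_types["text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight"] = GGML_TYPE_F16;
    tensor_types["text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.q.weight"] = GGML_TYPE_Q8_0;

    FluxCLIPEmbedder cond(backend, tensor_types);  // each runner selects per-tensor types under its prefix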