@@ -241,7 +241,7 @@ class CLIPTokenizer {
     std::vector<int> tokenize(std::string text,
                               on_new_token_cb_t on_new_token_cb,
                               size_t max_length = 0,
-                              bool padding = false) {
+                              bool padding = false) {
         std::vector<int32_t> tokens = encode(text, on_new_token_cb);
         tokens.insert(tokens.begin(), BOS_TOKEN_ID);
         if (max_length > 0) {
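For orientation, a minimal caller sketch (not part of the diff): the callback parameter has the same shape as the `on_new_token_cb` lambda defined later in this file, while the default-constructed tokenizer and the 77-token context length are assumptions, not values read from the hunk.

// Hypothetical usage sketch of the signature above. The no-op callback tells
// the tokenizer that no word should be treated as a custom embedding.
CLIPTokenizer tokenizer;  // assumed to be default-constructible here
auto no_custom_embeddings = [](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
    return false;  // false = "not an embedding, BPE-encode this text normally"
};
// 77 is the usual CLIP context length (an assumption); padding = true is
// expected to fill the sequence up to max_length.
std::vector<int> tokens = tokenizer.tokenize("a photo of an astronaut", no_custom_embeddings, 77, true);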
@@ -486,7 +486,6 @@ struct ResidualAttentionBlock

        ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
        ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
-
    }

    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -661,8 +660,8 @@ struct CLIPTextModel {
        mem_size += ggml_row_size(GGML_TYPE_I32, hidden_size * max_position_embeddings);  // position_ids
        mem_size += ggml_row_size(wtype, hidden_size * vocab_size);                       // token_embed_weight
        mem_size += ggml_row_size(wtype, hidden_size * max_position_embeddings);          // position_embed_weight
-        if (version == OPENAI_CLIP_VIT_L_14) {
-            mem_size += ggml_row_size(wtype, hidden_size * max_position_embeddings);  // token_embed_custom
+        if (version == OPENAI_CLIP_VIT_L_14) {
+            mem_size += ggml_row_size(wtype, hidden_size * max_position_embeddings);  // token_embed_custom
        }
        for (int i = 0; i < num_hidden_layers; i++) {
            mem_size += resblocks[i].calculate_mem_size(wtype);
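A quick sanity check of this accounting, not part of the diff: for a non-quantized type, ggml_row_size(type, n) is just n elements times the element size, so the embedding-related buffers for typical OPENAI_CLIP_VIT_L_14 sizes (hidden_size 768, max_position_embeddings 77, vocab_size 49408, f16 weights, all assumed here) come out to roughly 73 MiB, dominated by token_embed_weight.

// Back-of-the-envelope version of the accounting above, with assumed
// CLIP ViT-L/14 sizes; only the embedding buffers, not the resblocks.
#include <cstdio>

int main() {
    const size_t f16_size = 2, i32_size = 4;            // bytes per element
    const size_t hidden = 768, pos = 77, vocab = 49408;  // assumed model sizes

    size_t mem = 0;
    mem += i32_size * hidden * pos;    // position_ids
    mem += f16_size * hidden * vocab;  // token_embed_weight (~72.4 MiB on its own)
    mem += f16_size * hidden * pos;    // position_embed_weight
    mem += f16_size * hidden * pos;    // token_embed_custom (OPENAI_CLIP_VIT_L_14 only)
    printf("embedding buffers: %.1f MiB\n", mem / (1024.0 * 1024.0));
    return 0;
}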
@@ -688,32 +687,32 @@ struct CLIPTextModel {
        }
    }

-    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t> & bpe_tokens) {
+    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
        // the order matters
        ModelLoader model_loader;
-        if (!model_loader.init_from_file(embd_path)) {
+        if (!model_loader.init_from_file(embd_path)) {
            LOG_ERROR("embedding '%s' failed", embd_name.c_str());
            return false;
        }
        struct ggml_init_params params;
-        params.mem_size = 32 * 1024;  // max for custom embeddings 32 KB
-        params.mem_buffer = NULL;
-        params.no_alloc = false;
+        params.mem_size = 32 * 1024;  // max for custom embeddings 32 KB
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
        struct ggml_context* embd_ctx = ggml_init(params);
-        struct ggml_tensor* embd = NULL;
-        auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
-            if (tensor_storage.ne[0] != hidden_size) {
+        struct ggml_tensor* embd = NULL;
+        auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
+            if (tensor_storage.ne[0] != hidden_size) {
                LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
                return false;
            }
-            embd = ggml_new_tensor_2d(embd_ctx, token_embed_weight->type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+            embd = ggml_new_tensor_2d(embd_ctx, token_embed_weight->type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
            *dst_tensor = embd;
            return true;
        };
        model_loader.load_tensors(on_load, NULL);
        ggml_backend_tensor_set(token_embed_custom, embd->data, num_custom_embeddings * hidden_size * ggml_type_size(token_embed_custom->type), ggml_nbytes(embd));
        readed_embeddings.push_back(embd_name);
-        for (int i = 0; i < embd->ne[1]; i++) {
+        for (int i = 0; i < embd->ne[1]; i++) {
            bpe_tokens.push_back(vocab_size + num_custom_embeddings);
            // LOG_DEBUG("new custom token: %i", vocab_size + num_custom_embeddings);
            num_custom_embeddings++;
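For orientation (an illustration, not part of the diff): each row of the loaded tensor gets the synthetic token id vocab_size + num_custom_embeddings, and the last hunk below allocates a single table of vocab_size + num_custom_embeddings rows for the graph (the actual copy into it is outside these hunks), so those ids index that combined table directly. A self-contained sketch of the id layout, with an assumed 49408-entry CLIP BPE vocabulary:

// Hypothetical illustration of the id space produced by load_embedding():
// ids below vocab_size select rows of token_embed_weight, ids at or above it
// select the rows copied into token_embed_custom, in load order.
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t vocab_size  = 49408;  // assumed CLIP BPE vocab size
    int num_custom_embeddings = 0;

    // Loading a 2-vector embedding appends two synthetic ids:
    int32_t id0 = vocab_size + num_custom_embeddings++;  // 49408 -> custom row 0
    int32_t id1 = vocab_size + num_custom_embeddings++;  // 49409 -> custom row 1

    printf("custom ids %d, %d -> custom rows %d, %d\n",
           id0, id1, id0 - vocab_size, id1 - vocab_size);
    return 0;
}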
@@ -775,7 +774,7 @@ struct CLIPTextModel {

        final_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

-        if (version == OPENAI_CLIP_VIT_L_14) {
+        if (version == OPENAI_CLIP_VIT_L_14) {
            token_embed_custom = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings);
        }

@@ -878,11 +877,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {

        auto hidden_states2 = text_model2.forward(ctx0, input_ids2, NULL);  // [N, n_token, hidden_size2]
        hidden_states2 = ggml_reshape_4d(ctx0,
-                                         hidden_states2,
-                                         hidden_states2->ne[0],
-                                         hidden_states2->ne[1],
-                                         hidden_states2->ne[2],
-                                         hidden_states2->ne[3]);
+                                         hidden_states2,
+                                         hidden_states2->ne[0],
+                                         hidden_states2->ne[1],
+                                         hidden_states2->ne[2],
+                                         hidden_states2->ne[3]);
        hidden_states2 = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states2, 2, 0, 1, 3));

        hidden_states = ggml_concat(ctx0, hidden_states, hidden_states2);  // [N, n_token, hidden_size + hidden_size2]
@@ -913,20 +912,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
        }

-        auto on_new_token_cb = [&] (std::string& str, std::vector<int32_t> & bpe_tokens) -> bool {
-            size_t word_end = str.find(",");
+        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+            size_t word_end = str.find(",");
            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name = trim(embd_name);
+            embd_name = trim(embd_name);
            std::string embd_path = get_full_path(text_model.embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
+            if (embd_path.size() == 0) {
                embd_path = get_full_path(text_model.embd_dir, embd_name + ".ckpt");
            }
-            if (embd_path.size() == 0) {
+            if (embd_path.size() == 0) {
                embd_path = get_full_path(text_model.embd_dir, embd_name + ".safetensors");
            }
-            if (embd_path.size() > 0) {
-                if (text_model.load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
+            if (embd_path.size() > 0) {
+                if (text_model.load_embedding(embd_name, embd_path, bpe_tokens)) {
+                    if (word_end != std::string::npos) {
                        str = str.substr(word_end);
                    } else {
                        str = "";
@@ -1033,7 +1032,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {

        struct ggml_tensor* embeddings = NULL;

-        if (text_model.num_custom_embeddings > 0 && version != VERSION_XL) {
+        if (text_model.num_custom_embeddings > 0 && version != VERSION_XL) {
            embeddings = ggml_new_tensor_2d(ctx0, wtype, text_model.hidden_size, text_model.vocab_size + text_model.num_custom_embeddings /* custom placeholder */);
            ggml_allocr_alloc(allocr, embeddings);
            if (!ggml_allocr_is_measure(allocr)) {