@@ -66,17 +66,21 @@ struct SDParams {
66
66
// models
67
67
std::string model_path;
68
68
std::string clip_l_path;
69
+ std::string clip_g_path;
69
70
std::string t5xxl_path;
70
71
std::string diffusion_model_path;
71
72
std::string vae_path;
72
- // std::string taesd_path;
73
+ std::string taesd_path;
74
+ std::string esrgan_path;
75
+ std::string controlnet_path;
73
76
std::string embeddings_path;
74
77
std::string stacked_id_embeddings_path;
75
- std::string lora_model_dir;
76
-
78
+ std::string input_id_images_path;
77
79
sd_type_t wtype = SD_TYPE_COUNT;
80
+ std::string lora_model_dir;
78
81
std::string output_path = " output.png" ;
79
82
std::string input_path;
83
+ std::string control_image_path;
80
84
81
85
std::string prompt;
82
86
std::string negative_prompt;
@@ -93,17 +97,22 @@ struct SDParams {
93
97
schedule_t schedule = DEFAULT;
94
98
int sample_steps = 20 ;
95
99
float strength = 0 .75f ;
100
+ float control_strength = 0 .9f ;
96
101
rng_type_t rng_type = CUDA_RNG;
97
102
int64_t seed = 42 ;
98
103
bool verbose = false ;
99
104
bool vae_tiling = false ;
105
+ bool control_net_cpu = false ;
100
106
bool normalize_input = false ;
101
107
bool clip_on_cpu = false ;
102
108
bool vae_on_cpu = false ;
109
+ bool diffusion_flash_attn = false ;
103
110
bool color = false ;
104
111
105
- // Photomaker params
106
- std::string input_id_images_path;
112
+ std::vector<int > skip_layers = {7 , 8 , 9 };
113
+ float slg_scale = 0 .;
114
+ float skip_layer_start = 0.01 ;
115
+ float skip_layer_end = 0.2 ;
107
116
108
117
// server things
109
118
int port = 8080 ;
@@ -113,24 +122,34 @@ struct SDParams {
113
122
void print_params (SDParams params) {
114
123
printf (" Option: \n " );
115
124
printf (" n_threads: %d\n " , params.n_threads );
125
+ printf (" mode: server\n " );
116
126
printf (" model_path: %s\n " , params.model_path .c_str ());
117
127
printf (" wtype: %s\n " , params.wtype < SD_TYPE_COUNT ? sd_type_name (params.wtype ) : " unspecified" );
118
128
printf (" clip_l_path: %s\n " , params.clip_l_path .c_str ());
129
+ printf (" clip_g_path: %s\n " , params.clip_g_path .c_str ());
119
130
printf (" t5xxl_path: %s\n " , params.t5xxl_path .c_str ());
120
131
printf (" diffusion_model_path: %s\n " , params.diffusion_model_path .c_str ());
121
132
printf (" vae_path: %s\n " , params.vae_path .c_str ());
122
- // printf(" taesd_path: %s\n", params.taesd_path.c_str());
133
+ printf (" taesd_path: %s\n " , params.taesd_path .c_str ());
134
+ printf (" controlnet_path: %s\n " , params.controlnet_path .c_str ());
123
135
printf (" embeddings_path: %s\n " , params.embeddings_path .c_str ());
124
136
printf (" stacked_id_embeddings_path: %s\n " , params.stacked_id_embeddings_path .c_str ());
137
+ printf (" input_id_images_path: %s\n " , params.input_id_images_path .c_str ());
125
138
printf (" style ratio: %.2f\n " , params.style_ratio );
126
- printf (" normzalize input image : %s\n " , params.normalize_input ? " true" : " false" );
139
+ printf (" normalize input image : %s\n " , params.normalize_input ? " true" : " false" );
127
140
printf (" output_path: %s\n " , params.output_path .c_str ());
141
+ printf (" init_img: %s\n " , params.input_path .c_str ());
142
+ printf (" control_image: %s\n " , params.control_image_path .c_str ());
128
143
printf (" clip on cpu: %s\n " , params.clip_on_cpu ? " true" : " false" );
144
+ printf (" controlnet cpu: %s\n " , params.control_net_cpu ? " true" : " false" );
129
145
printf (" vae decoder on cpu:%s\n " , params.vae_on_cpu ? " true" : " false" );
146
+ printf (" diffusion flash attention:%s\n " , params.diffusion_flash_attn ? " true" : " false" );
147
+ printf (" strength(control): %.2f\n " , params.control_strength );
130
148
printf (" prompt: %s\n " , params.prompt .c_str ());
131
149
printf (" negative_prompt: %s\n " , params.negative_prompt .c_str ());
132
150
printf (" min_cfg: %.2f\n " , params.min_cfg );
133
151
printf (" cfg_scale: %.2f\n " , params.cfg_scale );
152
+ printf (" slg_scale: %.2f\n " , params.slg_scale );
134
153
printf (" guidance: %.2f\n " , params.guidance );
135
154
printf (" clip_skip: %d\n " , params.clip_skip );
136
155
printf (" width: %d\n " , params.width );
@@ -150,40 +169,59 @@ void print_usage(int argc, const char* argv[]) {
150
169
printf (" \n " );
151
170
printf (" arguments:\n " );
152
171
printf (" -h, --help show this help message and exit\n " );
153
- printf (" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n " );
154
- printf (" -t, --threads N number of threads to use during computation (default: -1).\n " );
172
+ printf (" -t, --threads N number of threads to use during computation (default: -1)\n " );
155
173
printf (" If threads <= 0, then threads will be set to the number of CPU physical cores\n " );
156
174
printf (" -m, --model [MODEL] path to full model\n " );
157
175
printf (" --diffusion-model path to the standalone diffusion model\n " );
158
176
printf (" --clip_l path to the clip-l text encoder\n " );
159
- printf (" --t5xxl path to the the t5xxl text encoder.\n " );
177
+ printf (" --clip_g path to the clip-g text encoder\n " );
178
+ printf (" --t5xxl path to the the t5xxl text encoder\n " );
160
179
printf (" --vae [VAE] path to vae\n " );
161
- printf (" --embd-dir [EMBEDDING_PATH] path to embeddings.\n " );
180
+ printf (" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n " );
181
+ printf (" --control-net [CONTROL_PATH] path to control net model\n " );
182
+ printf (" --embd-dir [EMBEDDING_PATH] path to embeddings\n " );
183
+ printf (" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n " );
184
+ printf (" --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir\n " );
185
+ printf (" --normalize-input normalize PHOTOMAKER input id images\n " );
186
+ // printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
187
+ // printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n");
162
188
printf (" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)\n " );
163
- printf (" If not specified, the default is the type of the weight file. \n " );
189
+ printf (" If not specified, the default is the type of the weight file\n " );
164
190
printf (" --lora-model-dir [DIR] lora model directory\n " );
191
+ printf (" --control-image [IMAGE] path to image condition, control net\n " );
165
192
printf (" -o, --output OUTPUT path to write result image to (default: ./output.png)\n " );
166
193
printf (" -p, --prompt [PROMPT] the prompt to render\n " );
167
194
printf (" -n, --negative-prompt PROMPT the negative prompt (default: \"\" )\n " );
168
195
printf (" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n " );
196
+ printf (" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n " );
197
+ printf (" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n " );
198
+ printf (" --skip_layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n " );
199
+ printf (" --skip_layer_start START SLG enabling point: (default: 0.01)\n " );
200
+ printf (" --skip_layer_end END SLG disabling point: (default: 0.2)\n " );
201
+ printf (" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n " );
169
202
printf (" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n " );
170
203
printf (" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n " );
171
204
printf (" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n " );
172
205
printf (" 1.0 corresponds to full destruction of information in init image\n " );
173
206
printf (" -H, --height H image height, in pixel space (default: 512)\n " );
174
207
printf (" -W, --width W image width, in pixel space (default: 512)\n " );
175
- printf (" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n " );
208
+ printf (" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}\n " );
176
209
printf (" sampling method (default: \" euler_a\" )\n " );
177
210
printf (" --steps STEPS number of sample steps (default: 20)\n " );
178
211
printf (" --rng {std_default, cuda} RNG (default: cuda)\n " );
179
212
printf (" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n " );
180
- printf (" -b, --batch-count COUNT number of images to generate. \n " );
181
- printf (" --schedule {discrete, karras, ays} Denoiser sigma schedule (default: discrete)\n " );
213
+ printf (" -b, --batch-count COUNT number of images to generate\n " );
214
+ printf (" --schedule {discrete, karras, exponential, ays, gits } Denoiser sigma schedule (default: discrete)\n " );
182
215
printf (" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n " );
183
216
printf (" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n " );
184
217
printf (" --vae-tiling process vae in tiles to reduce memory usage\n " );
185
218
printf (" --vae-on-cpu keep vae in cpu (for low vram)\n " );
186
- printf (" --clip-on-cpu keep clip in cpu (for low vram).\n " );
219
+ printf (" --clip-on-cpu keep clip in cpu (for low vram)\n " );
220
+ printf (" --diffusion-fa use flash attention in the diffusion model (for low vram)\n " );
221
+ printf (" Might lower quality, since it implies converting k and v to f16.\n " );
222
+ printf (" This might crash if it is not supported by the backend.\n " );
223
+ printf (" --control-net-cpu keep controlnet in cpu (for low vram)\n " );
224
+ printf (" --canny apply canny preprocessor (edge detection)\n " );
187
225
printf (" --color Colors the logging tags according to level\n " );
188
226
printf (" -v, --verbose print extra info\n " );
189
227
printf (" --port port used for server (default: 8080)\n " );
@@ -214,6 +252,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
214
252
break ;
215
253
}
216
254
params.clip_l_path = argv[i];
255
+ } else if (arg == " --clip_g" ) {
256
+ if (++i >= argc) {
257
+ invalid_arg = true ;
258
+ break ;
259
+ }
260
+ params.clip_g_path = argv[i];
217
261
} else if (arg == " --t5xxl" ) {
218
262
if (++i >= argc) {
219
263
invalid_arg = true ;
@@ -232,7 +276,42 @@ void parse_args(int argc, const char** argv, SDParams& params) {
232
276
break ;
233
277
}
234
278
params.vae_path = argv[i];
235
- // TODO Tiny AE
279
+ } else if (arg == " --taesd" ) {
280
+ if (++i >= argc) {
281
+ invalid_arg = true ;
282
+ break ;
283
+ }
284
+ params.taesd_path = argv[i];
285
+ } else if (arg == " --control-net" ) {
286
+ if (++i >= argc) {
287
+ invalid_arg = true ;
288
+ break ;
289
+ }
290
+ params.controlnet_path = argv[i];
291
+ } else if (arg == " --upscale-model" ) {
292
+ if (++i >= argc) {
293
+ invalid_arg = true ;
294
+ break ;
295
+ }
296
+ params.esrgan_path = argv[i];
297
+ } else if (arg == " --embd-dir" ) {
298
+ if (++i >= argc) {
299
+ invalid_arg = true ;
300
+ break ;
301
+ }
302
+ params.embeddings_path = argv[i];
303
+ } else if (arg == " --stacked-id-embd-dir" ) {
304
+ if (++i >= argc) {
305
+ invalid_arg = true ;
306
+ break ;
307
+ }
308
+ params.stacked_id_embeddings_path = argv[i];
309
+ } else if (arg == " --input-id-images-dir" ) {
310
+ if (++i >= argc) {
311
+ invalid_arg = true ;
312
+ break ;
313
+ }
314
+ params.input_id_images_path = argv[i];
236
315
} else if (arg == " --type" ) {
237
316
if (++i >= argc) {
238
317
invalid_arg = true ;
@@ -270,6 +349,18 @@ void parse_args(int argc, const char** argv, SDParams& params) {
270
349
break ;
271
350
}
272
351
params.lora_model_dir = argv[i];
352
+ } else if (arg == " -i" || arg == " --init-img" ) {
353
+ if (++i >= argc) {
354
+ invalid_arg = true ;
355
+ break ;
356
+ }
357
+ params.input_path = argv[i];
358
+ } else if (arg == " --control-image" ) {
359
+ if (++i >= argc) {
360
+ invalid_arg = true ;
361
+ break ;
362
+ }
363
+ params.control_image_path = argv[i];
273
364
} else if (arg == " -o" || arg == " --output" ) {
274
365
if (++i >= argc) {
275
366
invalid_arg = true ;
@@ -312,6 +403,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
312
403
break ;
313
404
}
314
405
params.style_ratio = std::stof (argv[i]);
406
+ } else if (arg == " --control-strength" ) {
407
+ if (++i >= argc) {
408
+ invalid_arg = true ;
409
+ break ;
410
+ }
411
+ params.control_strength = std::stof (argv[i]);
315
412
} else if (arg == " -H" || arg == " --height" ) {
316
413
if (++i >= argc) {
317
414
invalid_arg = true ;
@@ -338,12 +435,16 @@ void parse_args(int argc, const char** argv, SDParams& params) {
338
435
params.clip_skip = std::stoi (argv[i]);
339
436
} else if (arg == " --vae-tiling" ) {
340
437
params.vae_tiling = true ;
438
+ } else if (arg == " --control-net-cpu" ) {
439
+ params.control_net_cpu = true ;
341
440
} else if (arg == " --normalize-input" ) {
342
441
params.normalize_input = true ;
343
442
} else if (arg == " --clip-on-cpu" ) {
344
443
params.clip_on_cpu = true ; // will slow down get_learned_condition but necessary for low MEM GPUs
345
444
} else if (arg == " --vae-on-cpu" ) {
346
445
params.vae_on_cpu = true ; // will slow down latent decoding but necessary for low MEM GPUs
446
+ } else if (arg == " --diffusion-fa" ) {
447
+ params.diffusion_flash_attn = true ; // can reduce MEM significantly
347
448
} else if (arg == " -b" || arg == " --batch-count" ) {
348
449
if (++i >= argc) {
349
450
invalid_arg = true ;
@@ -411,6 +512,61 @@ void parse_args(int argc, const char** argv, SDParams& params) {
411
512
params.verbose = true ;
412
513
} else if (arg == " --color" ) {
413
514
params.color = true ;
515
+ } else if (arg == " --slg-scale" ) {
516
+ if (++i >= argc) {
517
+ invalid_arg = true ;
518
+ break ;
519
+ }
520
+ params.slg_scale = std::stof (argv[i]);
521
+ } else if (arg == " --skip-layers" ) {
522
+ if (++i >= argc) {
523
+ invalid_arg = true ;
524
+ break ;
525
+ }
526
+ if (argv[i][0 ] != ' [' ) {
527
+ invalid_arg = true ;
528
+ break ;
529
+ }
530
+ std::string layers_str = argv[i];
531
+ while (layers_str.back () != ' ]' ) {
532
+ if (++i >= argc) {
533
+ invalid_arg = true ;
534
+ break ;
535
+ }
536
+ layers_str += " " + std::string (argv[i]);
537
+ }
538
+ layers_str = layers_str.substr (1 , layers_str.size () - 2 );
539
+
540
+ std::regex regex (" [, ]+" );
541
+ std::sregex_token_iterator iter (layers_str.begin (), layers_str.end (), regex, -1 );
542
+ std::sregex_token_iterator end;
543
+ std::vector<std::string> tokens (iter, end);
544
+ std::vector<int > layers;
545
+ for (const auto & token : tokens) {
546
+ try {
547
+ layers.push_back (std::stoi (token));
548
+ } catch (const std::invalid_argument& e) {
549
+ invalid_arg = true ;
550
+ break ;
551
+ }
552
+ }
553
+ params.skip_layers = layers;
554
+
555
+ if (invalid_arg) {
556
+ break ;
557
+ }
558
+ } else if (arg == " --skip-layer-start" ) {
559
+ if (++i >= argc) {
560
+ invalid_arg = true ;
561
+ break ;
562
+ }
563
+ params.skip_layer_start = std::stof (argv[i]);
564
+ } else if (arg == " --skip-layer-end" ) {
565
+ if (++i >= argc) {
566
+ invalid_arg = true ;
567
+ break ;
568
+ }
569
+ params.skip_layer_end = std::stof (argv[i]);
414
570
} else if (arg == " --port" ) {
415
571
if (++i >= argc) {
416
572
invalid_arg = true ;
@@ -716,11 +872,12 @@ int main(int argc, const char* argv[]) {
716
872
717
873
sd_ctx_t * sd_ctx = new_sd_ctx (params.model_path .c_str (),
718
874
params.clip_l_path .c_str (),
875
+ params.clip_g_path .c_str (),
719
876
params.t5xxl_path .c_str (),
720
877
params.diffusion_model_path .c_str (),
721
878
params.vae_path .c_str (),
722
- " " ,
723
- " " ,
879
+ params. taesd_path . c_str () ,
880
+ params. controlnet_path . c_str () ,
724
881
params.lora_model_dir .c_str (),
725
882
params.embeddings_path .c_str (),
726
883
params.stacked_id_embeddings_path .c_str (),
@@ -732,8 +889,9 @@ int main(int argc, const char* argv[]) {
732
889
params.rng_type ,
733
890
params.schedule ,
734
891
params.clip_on_cpu ,
735
- true ,
736
- params.vae_on_cpu );
892
+ params.control_net_cpu ,
893
+ params.vae_on_cpu ,
894
+ params.diffusion_flash_attn );
737
895
738
896
if (sd_ctx == NULL ) {
739
897
printf (" new_sd_ctx_t failed\n " );
@@ -787,7 +945,12 @@ int main(int argc, const char* argv[]) {
787
945
1 ,
788
946
params.style_ratio ,
789
947
params.normalize_input ,
790
- params.input_id_images_path .c_str ());
948
+ params.input_id_images_path .c_str (),
949
+ params.skip_layers .data (),
950
+ params.skip_layers .size (),
951
+ params.slg_scale ,
952
+ params.skip_layer_start ,
953
+ params.skip_layer_end );
791
954
792
955
if (results == NULL ) {
793
956
printf (" generate failed\n " );
0 commit comments