@@ -356,7 +356,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 		requests          []*extProcPb.ProcessingRequest
 		pods              map[backendmetrics.Pod]*backendmetrics.Metrics
 		wantResponses     []*extProcPb.ProcessingResponse
-		wantMetrics       string
+		wantMetrics       map[string]string
 		wantErr           bool
 		immediateResponse *extProcPb.ImmediateResponse
 	}{
@@ -379,11 +379,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					KVCacheUsagePercent: 0.2,
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -460,11 +460,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -541,11 +541,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -624,7 +624,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 			wantErr: false,
-			wantMetrics: "",
+			wantMetrics: map[string]string{},
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
 					Response: &extProcPb.ProcessingResponse_ImmediateResponse{
@@ -668,11 +668,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -776,11 +776,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -884,11 +884,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="direct-model",target_model_name="direct-model"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -1186,19 +1186,47 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`),
+							Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+data: [DONE]`,
+							),
 							EndOfStream: false},
 					},
 				},
 				{
 					Request: &extProcPb.ProcessingRequest_ResponseBody{
 						ResponseBody: &extProcPb.HttpBody{
-							Body:        []byte("data: [DONE]"),
+							Body:        []byte(""),
 							EndOfStream: true},
 					},
 				},
 			},
 			wantErr: false,
+			wantMetrics: map[string]string{`inference_model_input_tokens`: `
+				# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model.
+				# TYPE inference_model_input_tokens histogram
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1"} 0
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="64"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="128"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="256"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="512"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1024"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="2048"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="4096"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8192"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16384"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32778"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="65536"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="131072"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="262144"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="524288"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1.048576e+06"} 1
+				inference_model_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1
+				inference_model_input_tokens_sum{model_name="",target_model_name=""} 7
+				inference_model_input_tokens_count{model_name="",target_model_name=""} 1
+			`},
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
 					Response: &extProcPb.ProcessingResponse_ResponseHeaders{
@@ -1305,7 +1333,9 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 						BodyMutation: &extProcPb.BodyMutation{
 							Mutation: &extProcPb.BodyMutation_StreamedResponse{
 								StreamedResponse: &extProcPb.StreamedBodyResponse{
-									Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`),
+									Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+data: [DONE]`,
+									),
 									EndOfStream: false,
 								},
 							},
@@ -1321,7 +1351,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 						BodyMutation: &extProcPb.BodyMutation{
 							Mutation: &extProcPb.BodyMutation_StreamedResponse{
 								StreamedResponse: &extProcPb.StreamedBodyResponse{
-									Body: []byte("data: [DONE]"),
+									Body: []byte(""),
 									EndOfStream: true,
 								},
 							},
@@ -1347,9 +1377,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			t.Errorf("Unexpected response, (-want +got): %v", diff)
 		}
 
-		if test.wantMetrics != "" {
-			if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil {
-				t.Error(err)
+		if len(test.wantMetrics) != 0 {
+			for metricName, value := range test.wantMetrics {
+				if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(value), metricName); err != nil {
+					t.Error(err)
+				}
 			}
 		}
 
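A minimal, self-contained sketch of the wantMetrics pattern this diff introduces: one expected exposition-format snippet per metric family, keyed by metric name, checked in a loop with metricsutils.GatherAndCompare. The counter registration and the direct Inc() call here are illustrative stand-ins (the hermetic test drives the metric through the ext-proc server); only legacyregistry and GatherAndCompare are used exactly as in the diff.

    package main

    import (
    	"fmt"
    	"strings"

    	"k8s.io/component-base/metrics"
    	"k8s.io/component-base/metrics/legacyregistry"
    	metricsutils "k8s.io/component-base/metrics/testutil"
    )

    func main() {
    	// Hypothetical stand-in for the EPP's request counter. With
    	// StabilityLevel ALPHA, component-base prefixes the exposed HELP
    	// text with "[ALPHA]", which is why the expected strings carry it.
    	requestTotal := metrics.NewCounterVec(
    		&metrics.CounterOpts{
    			Name:           "inference_model_request_total",
    			Help:           "Counter of inference model requests broken out for each model and target model.",
    			StabilityLevel: metrics.ALPHA,
    		},
    		[]string{"model_name", "target_model_name"},
    	)
    	legacyregistry.MustRegister(requestTotal)
    	requestTotal.WithLabelValues("my-model", "my-model-12345").Inc()

    	// One expected snippet per metric family; a second entry (e.g. the
    	// input-tokens histogram) would be verified by the same loop.
    	wantMetrics := map[string]string{
    		"inference_model_request_total": `
# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
# TYPE inference_model_request_total counter
inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
`,
    	}

    	// The reworked test loop: gather from the default registry and
    	// compare only the named metric family against its expected text.
    	for metricName, value := range wantMetrics {
    		if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(value), metricName); err != nil {
    			fmt.Println(err)
    		}
    	}
    }

The map form is what lets a single test case assert on several metric families at once (counter plus histogram), instead of hard-coding "inference_model_request_total" at the comparison site as the old string field did.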