@@ -387,7 +387,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 		requests          []*extProcPb.ProcessingRequest
 		pods              map[backendmetrics.Pod]*backendmetrics.Metrics
 		wantResponses     []*extProcPb.ProcessingResponse
-		wantMetrics       string
+		wantMetrics       map[string]string
 		wantErr           bool
 		immediateResponse *extProcPb.ImmediateResponse
 	}{
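The pattern this change builds on: `metricsutils` in this test presumably resolves to `k8s.io/component-base/metrics/testutil`, whose `GatherAndCompare` parses an expected Prometheus text-exposition string and diffs it against a gatherer, restricted to the named metric families. Below is a minimal self-contained sketch of that round trip; `demo_requests_total` is a hypothetical metric, not one from this test:

package demo

import (
	"strings"
	"testing"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
	"k8s.io/component-base/metrics/testutil"
)

// demoRequests is a hypothetical counter; component-base defaults its
// stability level to ALPHA and prefixes "[ALPHA]" into the rendered HELP
// line, which is why the expected strings in this test carry that marker.
var demoRequests = metrics.NewCounterVec(
	&metrics.CounterOpts{
		Name: "demo_requests_total",
		Help: "Counter of demo requests.",
	},
	[]string{"model_name"},
)

func TestDemoRequestsTotal(t *testing.T) {
	legacyregistry.MustRegister(demoRequests)
	demoRequests.WithLabelValues("my-model").Inc()

	// Expected output in Prometheus text exposition format; GatherAndCompare
	// only inspects the metric families named in its trailing arguments.
	want := `
		# HELP demo_requests_total [ALPHA] Counter of demo requests.
		# TYPE demo_requests_total counter
		demo_requests_total{model_name="my-model"} 1
	`
	if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(want), "demo_requests_total"); err != nil {
		t.Error(err)
	}
}

Keying `wantMetrics` by family name lets a single test case assert several families at once (a counter and a histogram, say), which the later hunks make use of.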
@@ -410,11 +410,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					KVCacheUsagePercent: 0.2,
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -491,11 +491,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -572,11 +572,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -655,7 +655,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 			wantErr:     false,
-			wantMetrics: "",
+			wantMetrics: map[string]string{},
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
 					Response: &extProcPb.ProcessingResponse_ImmediateResponse{
@@ -699,11 +699,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -807,11 +807,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -915,11 +915,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 					},
 				},
 			},
-			wantMetrics: `
+			wantMetrics: map[string]string{`inference_model_request_total`: `
 				# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
 				# TYPE inference_model_request_total counter
 				inference_model_request_total{model_name="direct-model",target_model_name="direct-model"} 1
-			`,
+			`},
 			wantErr: false,
 			wantResponses: []*extProcPb.ProcessingResponse{
 				{
@@ -1217,19 +1217,47 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
-						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`),
+						Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+data: [DONE]`,
+						),
 						EndOfStream: false},
 				},
 			},
 			{
 				Request: &extProcPb.ProcessingRequest_ResponseBody{
 					ResponseBody: &extProcPb.HttpBody{
-						Body:        []byte("data: [DONE]"),
+						Body:        []byte(""),
 						EndOfStream: true},
 				},
 			},
 		},
 		wantErr: false,
+		wantMetrics: map[string]string{`inference_model_input_tokens`: `
+			# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model.
+			# TYPE inference_model_input_tokens histogram
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1"} 0
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="64"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="128"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="256"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="512"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1024"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="2048"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="4096"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="8192"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="16384"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="32778"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="65536"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="131072"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="262144"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="524288"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="1.048576e+06"} 1
+			inference_model_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1
+			inference_model_input_tokens_sum{model_name="",target_model_name=""} 7
+			inference_model_input_tokens_count{model_name="",target_model_name=""} 1
+		`},
 		wantResponses: []*extProcPb.ProcessingResponse{
 			{
 				Response: &extProcPb.ProcessingResponse_ResponseHeaders{
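Note what the hunk above changes in the simulated stream: the `data: [DONE]` terminator now rides in the same chunk as the final usage event, and the `EndOfStream: true` message carries an empty body. A consumer therefore has to spot the terminator line by line inside a chunk rather than assume it arrives alone. A rough sketch of that per-line scan; `scanSSEChunk` is a hypothetical helper, not code from this repository:

package demo

import "strings"

// scanSSEChunk splits one streamed response body into SSE data payloads and
// reports whether the "[DONE]" terminator appeared anywhere in the chunk,
// since it may share a chunk with the final usage event.
func scanSSEChunk(body []byte) (events []string, done bool) {
	for _, line := range strings.Split(string(body), "\n") {
		payload, ok := strings.CutPrefix(strings.TrimSpace(line), "data: ")
		if !ok {
			continue // blank line, comment, or other non-data field
		}
		if payload == "[DONE]" {
			done = true
			continue
		}
		events = append(events, payload)
	}
	return events, done
}

Fed the two-line body from the hunk above, this yields the single usage JSON event plus done == true; fed the empty end-of-stream body, it yields nothing.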
@@ -1336,7 +1364,9 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 							BodyMutation: &extProcPb.BodyMutation{
 								Mutation: &extProcPb.BodyMutation_StreamedResponse{
 									StreamedResponse: &extProcPb.StreamedBodyResponse{
-										Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`),
+										Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
+data: [DONE]`,
+										),
 										EndOfStream: false,
 									},
 								},
@@ -1352,7 +1382,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 							BodyMutation: &extProcPb.BodyMutation{
 								Mutation: &extProcPb.BodyMutation_StreamedResponse{
 									StreamedResponse: &extProcPb.StreamedBodyResponse{
-										Body:        []byte("data: [DONE]"),
+										Body:        []byte(""),
 										EndOfStream: true,
 									},
 								},
@@ -1378,9 +1408,11 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				t.Errorf("Unexpected response, (-want +got): %v", diff)
 			}
 
-			if test.wantMetrics != "" {
-				if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil {
-					t.Error(err)
+			if len(test.wantMetrics) != 0 {
+				for metricName, value := range test.wantMetrics {
+					if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(value), metricName); err != nil {
+						t.Error(err)
+					}
 				}
 			}
 