3 files changed: +30 −83 lines changed
File 1 (GitHub Actions nightly benchmark workflow):

@@ -56,25 +56,4 @@
       - name: Clean up weights
        run: |
          rm -rf /home/ubuntu/Llama-2-7b-chat-hf
-      - name: Save benchmark artifacts
-        uses: actions/upload-artifact@v2
-        with:
-          name: nightly ${{ matrix.hardware }} artifact
-          path: /tmp/ts_benchmark
-      - name: Download benchmark artifacts for auto validation
-        uses: dawidd6/action-download-artifact@v2
-        with:
-          workflow: ${{ github.event.workflow_run.workflow_id }}
-          workflow_conclusion: success
-          if_no_artifact_found: ignore
-          path: /tmp/ts_artifacts
-          name: ${{ matrix.hardware }}_benchmark_validation
-      - name: Validate Benchmark result
-        run: python benchmarks/validate_report.py --input-artifacts-dir /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation
-      - name: Update benchmark artifacts for auto validation
-        run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation
-      - name: Upload the updated benchmark artifacts for auto validation
-        uses: actions/upload-artifact@v2
-        with:
-          name: ${{ matrix.hardware }}_benchmark_validation
-          path: /tmp/ts_artifacts
+
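The removed steps saved the nightly report as an artifact, pulled the previous run's artifacts, validated the new report against them, and refreshed the stored artifacts. For reference, the two repository scripts those steps invoked can still be run directly; below is a minimal sketch, assuming the artifacts were fetched to a local `/tmp/ts_artifacts/gpu_benchmark_validation` directory, with `gpu` standing in for `${{ matrix.hardware }}`:

```python
import subprocess

# Assumed local path; in the workflow this was
# /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation.
artifacts_dir = "/tmp/ts_artifacts/gpu_benchmark_validation"

# "Validate Benchmark result": check the new report against the stored artifacts.
subprocess.run(
    ["python", "benchmarks/validate_report.py", "--input-artifacts-dir", artifacts_dir],
    check=True,
)

# "Update benchmark artifacts for auto validation": refresh the stored artifacts
# so they reflect the new run.
subprocess.run(
    ["python", "benchmarks/utils/update_artifacts.py", "--output", artifacts_dir],
    check=True,
)
```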
File 2 (nightly benchmark run configuration YAML):

@@ -9,9 +9,8 @@ ts_version:
 # a list of model configure yaml files defined in benchmarks/models_config
 # or a list of model configure yaml files with full path
 models:
-  # - "llama-2-7b.yaml"
-  # - "llama-2-13b.yaml"
-  - "llama-2-70b.yaml"
+  - "llama-2-7b.yaml"
+  # - "llama-2-70b.yaml"
 
 # benchmark on "cpu" or "gpu".
 # "cpu" is set if "hardware" is not specified
@@ -24,11 +23,11 @@ hardware: &hardware "gpu"
 # - keep the values order as the same as the command definition.
 # - set up the command before enabling `metrics_cmd`.
 # For example, aws client and AWS credentials need to be setup before trying this example.
-# metrics_cmd:
-#   - "cmd": "aws cloudwatch put-metric-data"
-#   - "--namespace": ["torchserve_benchmark_nightly_", *hardware]
-#   - "--region": "us-east-2"
-#   - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
+metrics_cmd:
+  - "cmd": "aws cloudwatch put-metric-data"
+  - "--namespace": ["torchserve_benchmark_nightly_lmi"]
+  - "--region": "us-east-2"
+  - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
 
 # load report to remote storage or local different path if "report_cmd" is set.
 # the command line to load report to remote storage.
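With `metrics_cmd` enabled, the ordered flag/value pairs are meant to be flattened, in order, into a single `aws cloudwatch put-metric-data` invocation (the config comments note that the AWS CLI and credentials must already be set up). A minimal sketch of that flattening, assuming a local copy of the config saved as `benchmark_config.yaml`; the helper below is illustrative, not the benchmark harness's own code:

```python
import yaml  # PyYAML


def flatten_cmd(entries):
    """Join an ordered list of {flag: value} pairs into one command string."""
    parts = []
    for entry in entries:
        for key, value in entry.items():
            if isinstance(value, list):
                # List values concatenate in place,
                # e.g. ["torchserve_benchmark_nightly_", "gpu"] -> "torchserve_benchmark_nightly_gpu".
                value = "".join(str(v) for v in value)
            parts.append(str(value) if key == "cmd" else f"{key} {value}")
    return " ".join(parts)


with open("benchmark_config.yaml") as f:  # assumed local copy of the config above
    config = yaml.safe_load(f)

print(flatten_cmd(config["metrics_cmd"]))
# aws cloudwatch put-metric-data --namespace torchserve_benchmark_nightly_lmi
#   --region us-east-2 --metric-data file:///tmp/benchmark/logs/stats_metrics.json
```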
File 3 (llama-2-7b model benchmark configuration YAML):

@@ -1,40 +1,8 @@
 ---
 llama-2-7b:
-  int8:
-    benchmark_engine: "ab"
-    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
-    workers:
-      - 1
-    batch_delay: 100
-    batch_size:
-      - 1
-    input: "./examples/large_models/gpt_fast/request.json"
-    requests: 1000
-    concurrency: 1
-    backend_profiling: False
-    exec_env: "local"
-    processors:
-      - "gpus": "all"
-    stream: "false"
-  int4:
-    benchmark_engine: "ab"
-    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int4.mar
-    workers:
-      - 1
-    batch_delay: 100
-    batch_size:
-      - 1
-    input: "./examples/large_models/gpt_fast/request.json"
-    requests: 1000
-    concurrency: 1
-    backend_profiling: False
-    exec_env: "local"
-    processors:
-      - "gpus": "all"
-    stream: "false"
-  # base:
+  # int8:
   #   benchmark_engine: "ab"
-  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
+  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
   #   workers:
   #     - 1
   #   batch_delay: 100
@@ -48,36 +16,37 @@ llama-2-7b:
   #   processors:
   #     - "gpus": "all"
   #   stream: "false"
-  # int8-tp:
+  # int4:
   #   benchmark_engine: "ab"
-  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/llama-2-7b-int8-tp.mar
+  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int4.mar
   #   workers:
-  #     - 4
-  #   batch_delay: 100
-  #   batch_size:
   #     - 1
-  #   input: "./examples/large_models/gpt_fast/request.json"
-  #   requests: 1000
-  #   concurrency: 4
-  #   backend_profiling: False
-  #   exec_env: "local"
-  #   processors:
-  #     - "gpus": "all"
-  #   stream: "false"
-  # int4-tp:
-  #   benchmark_engine: "ab"
-  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/llama-2-7b-int4-tp.mar
-  #   workers:
-  #     - 4
   #   batch_delay: 100
   #   batch_size:
   #     - 1
   #   input: "./examples/large_models/gpt_fast/request.json"
   #   requests: 1000
-  #   concurrency: 4
+  #   concurrency: 1
   #   backend_profiling: False
   #   exec_env: "local"
   #   processors:
   #     - "gpus": "all"
   #   stream: "false"
+  base:
+    benchmark_engine: "ab"
+    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
+    workers:
+      - 1
+    batch_delay: 100
+    batch_size:
+      - 1
+    input: "./examples/large_models/gpt_fast/request.json"
+    requests: 1000
+    concurrency: 1
+    backend_profiling: False
+    exec_env: "local"
+    processors:
+      - "gpus": "all"
+    stream: "false"
+
 
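After this change only the `base` variant of llama-2-7b is active; the int8 and int4 variants stay in the file as comments. A quick way to confirm what the nightly run will pick up is to load the model config and list the uncommented variants; a minimal sketch, assuming a checkout where the file lives at `benchmarks/models_config/llama-2-7b.yaml` (the path follows the "defined in benchmarks/models_config" comment in the benchmark config above):

```python
import yaml  # PyYAML

CONFIG_PATH = "benchmarks/models_config/llama-2-7b.yaml"  # assumed checkout-relative path

with open(CONFIG_PATH) as f:
    model_config = yaml.safe_load(f)

# Commented-out variants are invisible to the YAML loader, so only active ones appear here.
for model_name, variants in model_config.items():
    for variant, params in variants.items():
        print(
            f"{model_name}/{variant}: workers={params['workers']}, "
            f"batch_size={params['batch_size']}, requests={params['requests']}, "
            f"concurrency={params['concurrency']}, url={params['url']}"
        )
# Expected with this diff applied: a single line for llama-2-7b/base.
```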