
Commit d633e19: Kubernetes nightly tests
1 parent 9fda9bd

5 files changed: +225 lines, -0 lines
@@ -0,0 +1,37 @@
name: Kubernetes Nightly Tests

on:
  push
  #workflow_dispatch:
  # runs everyday at 5:15am
  #schedule:
  #  - cron: '15 6 * * *'

jobs:
  kserve-cpu-tests:
    runs-on: [self-hosted, regression-test-gpu]
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          ls -la ./
          sudo rm -rf ./* || true
          sudo rm -rf ./.??* || true
          ls -la ./
      - name: Install minikube and kubectl
        run: |
          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
          sudo install minikube-linux-amd64 /usr/local/bin/minikube
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
          echo "/usr/local/bin" >> $GITHUB_PATH
          minikube addons enable metrics-server
      - name: Setup Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          architecture: x64
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Validate TorchServe
        run: ./kubernetes/tests/scripts/test_mnist.sh
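
The workflow is wired to run on push for now, with the manual and nightly triggers left commented out. As a rough sketch (assuming the commented-out workflow_dispatch: trigger is re-enabled and the default branch is master), a run could then be started by hand with the GitHub CLI:

# Hypothetical manual trigger; only works once workflow_dispatch is uncommented.
gh workflow run "Kubernetes Nightly Tests" --ref master
# Check the status of the latest run of this workflow.
gh run list --workflow "Kubernetes Nightly Tests" --limit 1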
@@ -0,0 +1,29 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: ts-def
5+
labels:
6+
app: ts-def
7+
spec:
8+
replicas: 1
9+
selector:
10+
matchLabels:
11+
app: ts-def
12+
template:
13+
metadata:
14+
labels:
15+
app: ts-def
16+
spec:
17+
volumes:
18+
- name: model-store
19+
hostPath:
20+
path: /host/model_store
21+
containers:
22+
- name: torchserve
23+
image: pytorch/torchserve-nightly:latest-gpu
24+
ports:
25+
- containerPort: 8080
26+
- containerPort: 8081
27+
volumeMounts:
28+
- name: model-store
29+
mountPath: /home/model-server/model-store
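
The Deployment mounts /host/model_store from the minikube node (which the test script maps back to $HOME/serve/model_store on the host via minikube start --mount-string) into TorchServe's model-store path. A minimal sketch of applying and checking it by hand, assuming the config path used by test_mnist.sh:

# Hypothetical manual deployment check; paths mirror test_mnist.sh.
kubectl apply -f ./kubernetes/tests/configs/deployment.yaml
kubectl get pods -l app=ts-def
kubectl describe pod -l app=ts-def
# Forward the inference/management ports directly from the Deployment
# (the test script forwards svc/ts-def, which needs the mnist Service applied first).
kubectl port-forward deploy/ts-def 8080:8080 8081:8081 &
curl http://127.0.0.1:8080/ping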

kubernetes/tests/docker/Dockerfile

@@ -0,0 +1,6 @@
1+
FROM pytorch/torchserve-nightly:latest-gpu
2+
ARG EXAMPLE_DIR
3+
4+
RUN apt-get install jq -y
5+
6+
COPY $EXAMPLE_DIR/../docker/config.properties /home/model-server/config.properties
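
The build_docker_image helper in test_mnist.sh builds this image inside minikube's Docker daemon, passing the scripts directory as EXAMPLE_DIR so the COPY resolves to kubernetes/tests/docker/config.properties. A hedged sketch of the equivalent manual build, assuming the repository root as the build context:

# Hypothetical manual build mirroring build_docker_image in test_mnist.sh.
eval $(minikube docker-env)          # point docker at minikube's daemon
docker build -t pytorch/torchserve-nightly:latest-gpu \
    --file ./kubernetes/tests/docker/Dockerfile \
    --build-arg EXAMPLE_DIR="./kubernetes/tests/scripts" .
eval $(minikube docker-env -u)       # restore the local docker environment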
@@ -0,0 +1,8 @@
1+
inference_address=http://0.0.0.0:8080
2+
management_address=http://0.0.0.0:8081
3+
metrics_address=http://0.0.0.0:8082
4+
number_of_netty_threads=32
5+
job_queue_size=1000
6+
model_store=/home/model-server/model-store
7+
workflow_store=/home/model-server/wf-store
8+
system_metrics_cmd=ts/metrics/metric_collector.py --gpu 0
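
The Dockerfile above copies this file over the image default at /home/model-server/config.properties, so the container comes up with these addresses, stores, and the system_metrics_cmd override. A minimal sketch of exercising the same config with a local TorchServe install (assuming torchserve is on the PATH and a ./model_store directory exists):

# Hypothetical local smoke test of the config, outside Kubernetes.
torchserve --start --ncs \
    --ts-config ./kubernetes/tests/docker/config.properties \
    --model-store ./model_store
curl http://127.0.0.1:8080/ping      # inference_address
curl http://127.0.0.1:8082/metrics   # metrics_address
torchserve --stop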
@@ -0,0 +1,145 @@
1+
#!/usr/bin/env bash
2+
3+
set -o errexit -o nounset -o pipefail
4+
5+
ACCEPTABLE_CPU_CORE_USAGE=2
6+
DOCKER_IMAGE=pytorch/torchserve-nightly:latest-gpu
7+
8+
# Get relative path of example dir
9+
EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
10+
ROOT_DIR=${EXAMPLE_DIR}/../../../
11+
ROOT_DIR=$(realpath "$ROOT_DIR")
12+
EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")
13+
14+
function start_minikube_cluster() {
15+
echo "Removing any previous Kubernetes cluster"
16+
minikube delete
17+
echo "Starting Kubernetes cluster"
18+
minikube start --gpus all --mount-string="$HOME/serve:/host" --mount
19+
minikube addons enable metrics-server
20+
}
21+
22+
function build_docker_image() {
23+
eval $(minikube docker-env)
24+
docker system prune -f
25+
docker build -t $DOCKER_IMAGE --file $ROOT_DIR/$EXAMPLE_DIR/../docker/Dockerfile --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" .
26+
eval $(minikube docker-env -u)
27+
28+
}
29+
30+
function get_model_archive() {
31+
echo "Downloading archive for $2"
32+
mkdir model_store -p
33+
wget $1 -O model_store/"$2".mar
34+
}
35+
36+
function deploy_cluster() {
37+
echo "Deploying the cluster"
38+
#cd $GITHUB_WORKSPACE
39+
kubectl apply -f "$1"
40+
echo "Waiting for pod to come up..."
41+
wait_for_pod_running "$2" 300
42+
echo "Check status of the pod"
43+
kubectl describe pod "$2"
44+
}
45+
46+
function wait_for_pod_running() {
47+
pod_name="$1"
48+
max_wait_time="$2"
49+
interval=5
50+
start_time=$(date +%s)
51+
while true; do
52+
sleep "$interval"
53+
pod_description=$(kubectl describe pod "$pod_name")
54+
status_line=$(echo "$pod_description" | grep -E "Status:")
55+
pod_status=$(echo "$status_line" | awk '{print $2}')
56+
if [[ "$pod_status" == "Running" ]]; then
57+
break
58+
fi
59+
current_time=$(date +%s)
60+
if (( current_time - start_time >= max_wait_time )); then
61+
echo "Timeout waiting for pod $pod_name to become Running."
62+
delete_minikube_cluster
63+
exit 1
64+
fi
65+
done
66+
}
67+
68+
function delete_minikube_cluster() {
69+
echo "Delete cluster"
70+
minikube delete
71+
}
72+
73+
function check_cpu_cores {
74+
75+
start_time=$(date +%s)
76+
interval=10
77+
while true; do
78+
# Check if the Metrics API error message is present
79+
if ! kubectl top pod -l app=$1 | grep -q $1 ; then
80+
sleep "$interval"
81+
else
82+
echo "Wait for metrics output to stabilize"
83+
sleep 60
84+
break
85+
fi
86+
if (( current_time - start_time >= max_wait_time )); then
87+
echo "Timeout waiting for metrics information to be available"
88+
delete_minikube_cluster
89+
exit 1
90+
fi
91+
done
92+
# Get the CPU cores used by the pod
93+
pod_name=$(kubectl get pods -l app=$1 -o json | jq -r '.items[].metadata.name')
94+
cpu=$(kubectl top pod -l app=$1 | awk "/${pod_name}/{print \$2}")
95+
96+
# Check if the CPU cores exceed 2
97+
if [ $(echo "$cpu" | sed 's/m$//') -gt $ACCEPTABLE_CPU_CORE_USAGE ]; then
98+
echo "✘ Test failed: CPU cores $(echo "$cpu" | sed 's/m$//') for $pod_name exceeded $ACCEPTABLE_CPU_CORE_USAGE" >&2
99+
exit 1
100+
else
101+
echo "✓ SUCCESS"
102+
fi
103+
}
104+
105+
function make_cluster_accessible() {
106+
kubectl apply -f $1
107+
kubectl port-forward svc/ts-def 8080:8080 8081:8081 &
108+
sleep "$2"
109+
}
110+
111+
function cleanup_port_forwarding() {
112+
echo "Clean up port forwarding"
113+
pkill kubectl
114+
}
115+
116+
function make_prediction() {
117+
curl -X POST "localhost:8081/models?model_name=$1&url=$1.mar&initial_workers=1"
118+
PREDICTION=$(curl http://127.0.0.1:8080/predictions/$1 -T $2)
119+
EXPECTED="$3"
120+
if [ "${PREDICTION}" = "${EXPECTED}" ]; then
121+
echo "✓ SUCCESS"
122+
cleanup_port_forwarding
123+
else
124+
echo "✘ Test failed: Prediction: ${PREDICTION}, expected ${EXPECTED}."
125+
delete_minikube_cluster
126+
exit 1
127+
fi
128+
129+
}
130+
131+
# Setup
132+
#start_minikube_cluster
133+
#build_docker_image
134+
#get_model_archive "https://torchserve.pytorch.org/mar_files/mnist_v2.mar" "mnist"
135+
deploy_cluster "./kubernetes/tests/configs/deployment.yaml" "ts-def"
136+
137+
echo "CPU usage test"
138+
check_cpu_cores "ts-def"
139+
140+
echo "MNIST test inference"
141+
make_cluster_accessible "kubernetes/examples/mnist/service.yaml" 5
142+
make_prediction "mnist" "examples/image_classifier/mnist/test_data/0.png" "0"
143+
144+
# Clean up
145+
#delete_minikube_cluster
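
As committed, the Setup and Clean up helpers are commented out, so the script assumes a running minikube cluster and an already-downloaded MNIST archive. A hedged sketch of the one-time preparation those commented-out helpers would perform, assuming the repository is checked out at $HOME/serve to match the --mount-string path:

# Hypothetical manual setup mirroring the commented-out Setup section.
cd "$HOME/serve"
minikube delete
minikube start --gpus all --mount-string="$HOME/serve:/host" --mount
minikube addons enable metrics-server
mkdir -p model_store
wget https://torchserve.pytorch.org/mar_files/mnist_v2.mar -O model_store/mnist.mar

# Then run the nightly test end to end from the repo root.
./kubernetes/tests/scripts/test_mnist.sh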
