Commit 0e7d0b5

Merge branch 'master' into feature/install_in_cmake
2 parents ab8eb7e + 53bab8e commit 0e7d0b5

8 files changed, +270 -0 lines changed

.github/workflows/kubernetes_tests.yml (+35)

@@ -0,0 +1,35 @@
+name: Kubernetes Nightly Tests
+
+on:
+  workflow_dispatch:
+  # runs every day at 6:15am
+  schedule:
+    - cron: '15 6 * * *'
+
+jobs:
+  kubernetes-tests:
+    runs-on: [self-hosted, regression-test-gpu]
+    steps:
+      - name: Clean up previous run
+        run: |
+          echo "Cleaning up previous run"
+          ls -la ./
+          sudo rm -rf ./* || true
+          sudo rm -rf ./.??* || true
+          ls -la ./
+      - name: Install minikube and kubectl
+        run: |
+          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
+          sudo install minikube-linux-amd64 /usr/local/bin/minikube
+          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+          echo "/usr/local/bin" >> $GITHUB_PATH
+      - name: Setup Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+          architecture: x64
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+      - name: Validate TorchServe
+        run: ./kubernetes/tests/scripts/test_mnist.sh
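
With workflow_dispatch declared alongside the 6:15am cron, the tests can also be triggered on demand. A minimal sketch using the GitHub CLI, assuming the file lands at .github/workflows/kubernetes_tests.yml (the path implied by the README badge added below):

    # trigger the nightly Kubernetes tests manually
    gh workflow run kubernetes_tests.yml --repo pytorch/serve
    # check the status of recent runs
    gh run list --workflow=kubernetes_tests.yml --repo pytorch/serve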

README.md (+1)

@@ -5,6 +5,7 @@
 ![Benchmark Nightly](https://github.com/pytorch/serve/actions/workflows/benchmark_nightly.yml/badge.svg)
 ![Docker Regression Nightly](https://github.com/pytorch/serve/actions/workflows/regression_tests_docker.yml/badge.svg)
 ![KServe Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kserve_cpu_tests.yml/badge.svg)
+![Kubernetes Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kubernetes_tests.yml/badge.svg)
 
 TorchServe is a flexible and easy-to-use tool for serving and scaling PyTorch models in production.

docker/build_upload_release.py (+20)

@@ -36,6 +36,14 @@
     f"./build_image.sh -g -cv cu121 -t {organization}/torchserve:latest-gpu",
     dry_run,
 )
+try_and_handle(
+    f"./build_image.sh -bt dev -cpp -t {organization}/torchserve:latest-cpp-dev-cpu",
+    dry_run,
+)
+try_and_handle(
+    f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/torchserve:latest-cpp-dev-gpu",
+    dry_run,
+)
 try_and_handle(
     f"docker tag {organization}/torchserve:latest {organization}/torchserve:latest-cpu",
     dry_run,
@@ -48,13 +56,25 @@
     f"docker tag {organization}/torchserve:latest-gpu {organization}/torchserve:{check_ts_version()}-gpu",
     dry_run,
 )
+try_and_handle(
+    f"docker tag {organization}/torchserve:latest-cpp-dev-cpu {organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+    dry_run,
+)
+try_and_handle(
+    f"docker tag {organization}/torchserve:latest-cpp-dev-gpu {organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
+    dry_run,
+)
 
 for image in [
     f"{organization}/torchserve:latest",
     f"{organization}/torchserve:latest-cpu",
     f"{organization}/torchserve:latest-gpu",
+    f"{organization}/torchserve:latest-cpp-dev-cpu",
+    f"{organization}/torchserve:latest-cpp-dev-gpu",
     f"{organization}/torchserve:{check_ts_version()}-cpu",
     f"{organization}/torchserve:{check_ts_version()}-gpu",
+    f"{organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+    f"{organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
 ]:
     try_and_handle(f"docker push {image}", dry_run)
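
The new try_and_handle calls publish C++ development images alongside the existing CPU/GPU release images. A quick smoke check of a published tag might look like this (a sketch; the organization is assumed to be pytorch, matching the images referenced elsewhere in this commit, and the tag names come straight from the diff):

    # pull the C++ dev image and open a shell to confirm it is usable
    docker pull pytorch/torchserve:latest-cpp-dev-cpu
    docker run --rm -it pytorch/torchserve:latest-cpp-dev-cpu bash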

docker/docker_nightly.py (+22)

@@ -35,17 +35,29 @@
 project = "torchserve-nightly"
 cpu_version = f"{project}:cpu-{get_nightly_version()}"
 gpu_version = f"{project}:gpu-{get_nightly_version()}"
+cpp_dev_cpu_version = f"{project}:cpp-dev-cpu-{get_nightly_version()}"
+cpp_dev_gpu_version = f"{project}:cpp-dev-gpu-{get_nightly_version()}"
 
 # Build Nightly images and append the date in the name
 try_and_handle(f"./build_image.sh -n -t {organization}/{cpu_version}", dry_run)
 try_and_handle(
     f"./build_image.sh -g -cv cu121 -n -t {organization}/{gpu_version}",
     dry_run,
 )
+try_and_handle(
+    f"./build_image.sh -bt dev -cpp -t {organization}/{cpp_dev_cpu_version}",
+    dry_run,
+)
+try_and_handle(
+    f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/{cpp_dev_gpu_version}",
+    dry_run,
+)
 
 # Push Nightly images to official PyTorch Dockerhub account
 try_and_handle(f"docker push {organization}/{cpu_version}", dry_run)
 try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)
+try_and_handle(f"docker push {organization}/{cpp_dev_cpu_version}", dry_run)
+try_and_handle(f"docker push {organization}/{cpp_dev_gpu_version}", dry_run)
 
 # Tag nightly images with latest
 try_and_handle(
@@ -56,10 +68,20 @@
     f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
     dry_run,
 )
+try_and_handle(
+    f"docker tag {organization}/{cpp_dev_cpu_version} {organization}/{project}:latest-cpp-dev-cpu",
+    dry_run,
+)
+try_and_handle(
+    f"docker tag {organization}/{cpp_dev_gpu_version} {organization}/{project}:latest-cpp-dev-gpu",
+    dry_run,
+)
 
 # Push images with latest tag
 try_and_handle(f"docker push {organization}/{project}:latest-cpu", dry_run)
 try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)
+try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-cpu", dry_run)
+try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-gpu", dry_run)
 
 # Cleanup built images
 if args.cleanup:
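
Each nightly is published twice: under a date-stamped tag built from get_nightly_version() and under a moving latest-* tag that this script re-points every night, so a consumer can pin either. A sketch (the YYYY.MM.DD stamp format is an assumption; only the latest-* names appear in this diff):

    # pin a specific dated nightly (tag format assumed: <flavor>-YYYY.MM.DD)
    docker pull pytorch/torchserve-nightly:cpp-dev-cpu-2024.02.13
    # or track the moving tag
    docker pull pytorch/torchserve-nightly:latest-cpp-dev-cpu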

kubernetes/tests/configs/deployment.yaml (+29)

@@ -0,0 +1,29 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ts-def
+  labels:
+    app: ts-def
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: ts-def
+  template:
+    metadata:
+      labels:
+        app: ts-def
+    spec:
+      volumes:
+        - name: model-store
+          hostPath:
+            path: /host/model_store
+      containers:
+        - name: torchserve
+          image: pytorch/torchserve-nightly:latest-gpu
+          ports:
+            - containerPort: 8080
+            - containerPort: 8081
+          volumeMounts:
+            - name: model-store
+              mountPath: /home/model-server/model-store
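
The test harness below applies this manifest via deploy_cluster and polls the pod directly. Outside the harness, a standalone sanity check against the same minikube context could be (a sketch, not part of this commit):

    kubectl apply -f kubernetes/tests/configs/deployment.yaml
    kubectl rollout status deployment/ts-def --timeout=300s   # wait for the single replica
    kubectl get pods -l app=ts-def                            # pod should report Running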

kubernetes/tests/docker/Dockerfile (+7)

@@ -0,0 +1,7 @@
+FROM pytorch/torchserve-nightly:latest-gpu
+ARG EXAMPLE_DIR
+USER root
+
+RUN apt-get update && apt-get install jq -y
+COPY $EXAMPLE_DIR/../docker/config.properties /home/model-server/config.properties
+USER model-server

kubernetes/tests/docker/config.properties (+8)

@@ -0,0 +1,8 @@
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
+number_of_netty_threads=32
+job_queue_size=1000
+model_store=/home/model-server/model-store
+workflow_store=/home/model-server/wf-store
+system_metrics_cmd=ts/metrics/metric_collector.py --gpu 0
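
Binding the three listeners to 0.0.0.0 makes them reachable from outside the container on ports 8080-8082. Once the ports are reachable from the host (e.g., via the kubectl port-forward the test script sets up for 8080/8081), the standard TorchServe endpoints can be probed:

    curl http://localhost:8080/ping      # inference API health check
    curl http://localhost:8081/models    # management API: list registered models
    curl http://localhost:8082/metrics   # metrics endpoint (not forwarded by the test script)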

kubernetes/tests/scripts/test_mnist.sh (+148)

@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+
+ACCEPTABLE_CPU_CORE_USAGE=2
+DOCKER_IMAGE=pytorch/torchserve-nightly:latest-gpu
+
+# Get relative path of example dir with respect to root
+# Ex: if ROOT_DIR is ~/serve , EXAMPLE_DIR is ./kubernetes/tests/scripts
+EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
+ROOT_DIR=${EXAMPLE_DIR}/../../../
+ROOT_DIR=$(realpath "$ROOT_DIR")
+EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")
+
+function start_minikube_cluster() {
+    echo "Removing any previous Kubernetes cluster"
+    minikube delete
+    echo "Starting Kubernetes cluster"
+    minikube start --gpus all --mount-string="$GITHUB_WORKSPACE:/host" --mount
+    minikube addons enable metrics-server
+}
+
+function build_docker_image() {
+    eval $(minikube docker-env)
+    docker system prune -f
+    docker build -t $DOCKER_IMAGE --file $ROOT_DIR/$EXAMPLE_DIR/../docker/Dockerfile --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" .
+    eval $(minikube docker-env -u)
+
+}
+
+function get_model_archive() {
+    echo "Downloading archive for $2"
+    mkdir model_store -p
+    wget $1 -O model_store/"$2".mar
+    pwd
+    echo $GITHUB_WORKSPACE
+}
+
+function deploy_cluster() {
+    echo "Deploying the cluster"
+    kubectl apply -f "$1"
+    echo "Waiting for pod to come up..."
+    wait_for_pod_running "$2" 300
+    echo "Check status of the pod"
+    kubectl describe pod "$2"
+}
+
+function wait_for_pod_running() {
+    pod_name="$1"
+    max_wait_time="$2"
+    interval=5
+    start_time=$(date +%s)
+    while true; do
+        sleep "$interval"
+        pod_description=$(kubectl describe pod "$pod_name")
+        status_line=$(echo "$pod_description" | grep -E "Status:")
+        pod_status=$(echo "$status_line" | awk '{print $2}')
+        if [[ "$pod_status" == "Running" ]]; then
+            break
+        fi
+        current_time=$(date +%s)
+        if (( current_time - start_time >= max_wait_time )); then
+            echo "Timeout waiting for pod $pod_name to become Running."
+            delete_minikube_cluster
+            exit 1
+        fi
+    done
+}
+
+function delete_minikube_cluster() {
+    echo "Delete cluster"
+    minikube delete
+}
+
+function check_cpu_cores {
+
+    start_time=$(date +%s)
+    interval=10
+    while true; do
+        # Check if the Metrics API error message is present
+        if ! kubectl top pod -l app=$1 | grep -q $1 ; then
+            sleep "$interval"
+        else
+            echo "Wait for metrics output to stabilize"
+            sleep 60
+            break
+        fi
+        current_time=$(date +%s)
+        if (( current_time - start_time >= $2 )); then
+            echo "Timeout waiting for metrics information to be available"
+            delete_minikube_cluster
+            exit 1
+        fi
+    done
+    # Get the CPU cores used by the pod
+    pod_name=$(kubectl get pods -l app=$1 -o json | jq -r '.items[].metadata.name')
+    cpu=$(kubectl top pod -l app=$1 | awk "/${pod_name}/{print \$2}")
+
+    # Check if the CPU cores exceed 2
+    if [ $(echo "$cpu" | sed 's/m$//') -gt $ACCEPTABLE_CPU_CORE_USAGE ]; then
+        echo "✘ Test failed: CPU cores $(echo "$cpu" | sed 's/m$//') for $pod_name exceeded $ACCEPTABLE_CPU_CORE_USAGE" >&2
+        exit 1
+    else
+        echo "✓ SUCCESS"
+    fi
+}
+
+function make_cluster_accessible() {
+    kubectl apply -f $1
+    kubectl port-forward svc/ts-def 8080:8080 8081:8081 &
+    sleep "$2"
+}
+
+function cleanup_port_forwarding() {
+    echo "Clean up port forwarding"
+    pkill kubectl
+}
+
+function make_prediction() {
+    curl -X POST "localhost:8081/models?model_name=$1&url=$1.mar&initial_workers=1"
+    PREDICTION=$(curl http://127.0.0.1:8080/predictions/$1 -T $2)
+    EXPECTED="$3"
+    if [ "${PREDICTION}" = "${EXPECTED}" ]; then
+        echo "✓ SUCCESS"
+        cleanup_port_forwarding
+    else
+        echo "✘ Test failed: Prediction: ${PREDICTION}, expected ${EXPECTED}."
+        delete_minikube_cluster
+        exit 1
+    fi
+
+}
+
+# Setup
+start_minikube_cluster
+build_docker_image
+get_model_archive "https://torchserve.pytorch.org/mar_files/mnist_v2.mar" "mnist"
+deploy_cluster "./kubernetes/tests/configs/deployment.yaml" "ts-def"
+
+echo "No model loaded CPU usage test"
+check_cpu_cores "ts-def" 180
+
+echo "MNIST test inference"
+make_cluster_accessible "kubernetes/examples/mnist/service.yaml" 5
+make_prediction "mnist" "examples/image_classifier/mnist/test_data/0.png" "0"
+
+# Clean up
+delete_minikube_cluster
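
On CI the script inherits GITHUB_WORKSPACE from the GitHub Actions runner. To try it locally on a GPU machine with minikube, kubectl, docker, and jq installed, the variable has to be supplied by hand (a sketch, not part of this commit):

    # run from the root of the serve checkout
    export GITHUB_WORKSPACE=$(pwd)
    ./kubernetes/tests/scripts/test_mnist.sh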
