diff --git a/.github/workflows/kubernetes_tests.yml b/.github/workflows/kubernetes_tests.yml
new file mode 100644
index 0000000000..3eab094b6a
--- /dev/null
+++ b/.github/workflows/kubernetes_tests.yml
@@ -0,0 +1,35 @@
+name: Kubernetes Nightly Tests
+
+on:
+  workflow_dispatch:
+  # runs everyday at 6:15am
+  schedule:
+    - cron: '15 6 * * *'
+
+jobs:
+  kubernetes-tests:
+    runs-on: [self-hosted, regression-test-gpu]
+    steps:
+      - name: Clean up previous run
+        run: |
+          echo "Cleaning up previous run"
+          ls -la ./
+          sudo rm -rf ./* || true
+          sudo rm -rf ./.??* || true
+          ls -la ./
+      - name: Install minikube and kubectl
+        run: |
+          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
+          sudo install minikube-linux-amd64 /usr/local/bin/minikube
+          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+          echo "/usr/local/bin" >> $GITHUB_PATH
+      - name: Setup Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+          architecture: x64
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+      - name: Validate TorchServe
+        run: ./kubernetes/tests/scripts/test_mnist.sh
diff --git a/README.md b/README.md
index a4aa612b07..6c7c0cf9a2 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
 ![Benchmark Nightly](https://github.com/pytorch/serve/actions/workflows/benchmark_nightly.yml/badge.svg)
 ![Docker Regression Nightly](https://github.com/pytorch/serve/actions/workflows/regression_tests_docker.yml/badge.svg)
 ![KServe Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kserve_cpu_tests.yml/badge.svg)
+![Kubernetes Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kubernetes_tests.yml/badge.svg)
 
 TorchServe is a flexible and easy-to-use tool for serving and scaling PyTorch models in production.
 
diff --git a/kubernetes/tests/configs/deployment.yaml b/kubernetes/tests/configs/deployment.yaml
new file mode 100644
index 0000000000..8d593b1510
--- /dev/null
+++ b/kubernetes/tests/configs/deployment.yaml
@@ -0,0 +1,29 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ts-def
+  labels:
+    app: ts-def
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: ts-def
+  template:
+    metadata:
+      labels:
+        app: ts-def
+    spec:
+      volumes:
+        - name: model-store
+          hostPath:
+            path: /host/model_store
+      containers:
+        - name: torchserve
+          image: pytorch/torchserve-nightly:latest-gpu
+          ports:
+            - containerPort: 8080
+            - containerPort: 8081
+          volumeMounts:
+            - name: model-store
+              mountPath: /home/model-server/model-store
diff --git a/kubernetes/tests/docker/Dockerfile b/kubernetes/tests/docker/Dockerfile
new file mode 100644
index 0000000000..b31f5752db
--- /dev/null
+++ b/kubernetes/tests/docker/Dockerfile
@@ -0,0 +1,7 @@
+FROM pytorch/torchserve-nightly:latest-gpu
+ARG EXAMPLE_DIR
+USER root
+
+RUN apt-get update && apt-get install jq -y && rm -rf /var/lib/apt/lists/*
+COPY $EXAMPLE_DIR/../docker/config.properties /home/model-server/config.properties
+USER model-server
diff --git a/kubernetes/tests/docker/config.properties b/kubernetes/tests/docker/config.properties
new file mode 100644
index 0000000000..d1d199dabb
--- /dev/null
+++ b/kubernetes/tests/docker/config.properties
@@ -0,0 +1,8 @@
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
+number_of_netty_threads=32
+job_queue_size=1000
+model_store=/home/model-server/model-store
+workflow_store=/home/model-server/wf-store
+system_metrics_cmd=ts/metrics/metric_collector.py --gpu 0
diff --git a/kubernetes/tests/scripts/test_mnist.sh b/kubernetes/tests/scripts/test_mnist.sh
new file mode 100755
index 0000000000..9e0603c2f8
--- /dev/null
+++ b/kubernetes/tests/scripts/test_mnist.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+set -o errexit -o nounset -o pipefail
+
+ACCEPTABLE_CPU_CORE_USAGE=2
+DOCKER_IMAGE=pytorch/torchserve-nightly:latest-gpu
+
+# Get relative path of example dir with respect to root
+# Ex: if ROOT_DIR is ~/serve , EXAMPLE_DIR is ./kubernetes/tests/scripts
+EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
+ROOT_DIR=${EXAMPLE_DIR}/../../../
+ROOT_DIR=$(realpath "$ROOT_DIR")
+EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")
+
+function start_minikube_cluster() {
+    echo "Removing any previous Kubernetes cluster"
+    minikube delete
+    echo "Starting Kubernetes cluster"
+    minikube start --gpus all --mount-string="$GITHUB_WORKSPACE:/host" --mount
+    minikube addons enable metrics-server
+}
+
+function build_docker_image() {
+    eval "$(minikube docker-env)"
+    docker system prune -f
+    docker build -t "$DOCKER_IMAGE" --file "$ROOT_DIR/$EXAMPLE_DIR"/../docker/Dockerfile --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" .
+    eval "$(minikube docker-env -u)"
+
+}
+
+function get_model_archive() {
+    echo "Downloading archive for $2"
+    mkdir model_store -p
+    wget "$1" -O model_store/"$2".mar
+    pwd
+    echo "$GITHUB_WORKSPACE"
+}
+
+function deploy_cluster() {
+    echo "Deploying the cluster"
+    kubectl apply -f "$1"
+    echo "Waiting for pod to come up..."
+    wait_for_pod_running "$2" 300
+    echo "Check status of the pod"
+    kubectl describe pod "$2"
+}
+
+function wait_for_pod_running() {
+    pod_name="$1"
+    max_wait_time="$2"
+    interval=5
+    start_time=$(date +%s)
+    while true; do
+        sleep "$interval"
+        pod_description=$(kubectl describe pod "$pod_name")
+        status_line=$(echo "$pod_description" | grep -E "Status:")
+        pod_status=$(echo "$status_line" | awk '{print $2}')
+        if [[ "$pod_status" == "Running" ]]; then
+            break
+        fi
+        current_time=$(date +%s)
+        if (( current_time - start_time >= max_wait_time )); then
+            echo "Timeout waiting for pod $pod_name to become Running."
+            delete_minikube_cluster
+            exit 1
+        fi
+    done
+}
+
+function delete_minikube_cluster() {
+    echo "Delete cluster"
+    minikube delete
+}
+
+function check_cpu_cores {
+
+    start_time=$(date +%s)
+    interval=10
+    while true; do
+        # Check if the Metrics API error message is present
+        if ! kubectl top pod -l app="$1" | grep -q "$1" ; then
+            sleep "$interval"
+        else
+            echo "Wait for metrics output to stabilize"
+            sleep 60
+            break
+        fi
+        current_time=$(date +%s)
+        if (( current_time - start_time >= $2 )); then
+            echo "Timeout waiting for metrics information to be available"
+            delete_minikube_cluster
+            exit 1
+        fi
+    done
+    # Get the CPU cores used by the pod
+    pod_name=$(kubectl get pods -l app="$1" -o json | jq -r '.items[].metadata.name')
+    cpu=$(kubectl top pod -l app="$1" | awk "/${pod_name}/{print \$2}")
+
+    # Check if the CPU cores exceed 2
+    if [ "$(echo "$cpu" | sed 's/m$//')" -gt "$ACCEPTABLE_CPU_CORE_USAGE" ]; then
+        echo "✘ Test failed: CPU cores $(echo "$cpu" | sed 's/m$//') for $pod_name exceeded $ACCEPTABLE_CPU_CORE_USAGE" >&2
+        delete_minikube_cluster; exit 1
+    else
+        echo "✓ SUCCESS"
+    fi
+}
+
+function make_cluster_accessible() {
+    kubectl apply -f "$1"
+    kubectl port-forward svc/ts-def 8080:8080 8081:8081 &
+    sleep "$2"
+}
+
+function cleanup_port_forwarding() {
+    echo "Clean up port forwarding"
+    pkill kubectl
+}
+
+function make_prediction() {
+    curl -X POST "localhost:8081/models?model_name=$1&url=$1.mar&initial_workers=1"
+    PREDICTION=$(curl "http://127.0.0.1:8080/predictions/$1" -T "$2")
+    EXPECTED="$3"
+    if [ "${PREDICTION}" = "${EXPECTED}" ]; then
+        echo "✓ SUCCESS"
+        cleanup_port_forwarding
+    else
+        echo "✘ Test failed: Prediction: ${PREDICTION}, expected ${EXPECTED}." >&2
+        delete_minikube_cluster; cleanup_port_forwarding || true
+        exit 1
+    fi
+
+}
+
+# Setup
+start_minikube_cluster
+build_docker_image
+get_model_archive "https://torchserve.pytorch.org/mar_files/mnist_v2.mar" "mnist"
+deploy_cluster "./kubernetes/tests/configs/deployment.yaml" "ts-def"
+
+echo "No model loaded CPU usage test"
+check_cpu_cores "ts-def" 180
+
+echo "MNIST test inference"
+make_cluster_accessible "kubernetes/examples/mnist/service.yaml" 5
+make_prediction "mnist" "examples/image_classifier/mnist/test_data/0.png" "0"
+
+# Clean up
+delete_minikube_cluster