Commit 0e7d0b5

Merge branch 'master' into feature/install_in_cmake
2 parents ab8eb7e + 53bab8e commit 0e7d0b5

8 files changed, +270 -0 lines changed

.github/workflows/kubernetes_tests.yml (+35)

@@ -0,0 +1,35 @@
+name: Kubernetes Nightly Tests
+
+on:
+  workflow_dispatch:
+  # runs every day at 6:15am
+  schedule:
+    - cron: '15 6 * * *'
+
+jobs:
+  kubernetes-tests:
+    runs-on: [self-hosted, regression-test-gpu]
+    steps:
+      - name: Clean up previous run
+        run: |
+          echo "Cleaning up previous run"
+          ls -la ./
+          sudo rm -rf ./* || true
+          sudo rm -rf ./.??* || true
+          ls -la ./
+      - name: Install minikube and kubectl
+        run: |
+          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
+          sudo install minikube-linux-amd64 /usr/local/bin/minikube
+          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+          echo "/usr/local/bin" >> $GITHUB_PATH
+      - name: Setup Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+          architecture: x64
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+      - name: Validate TorchServe
+        run: ./kubernetes/tests/scripts/test_mnist.sh
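
With workflow_dispatch declared alongside the 6:15am cron, the tests can also be triggered on demand. A minimal sketch using the GitHub CLI, assuming the file lands at .github/workflows/kubernetes_tests.yml (the path implied by the README badge added below):

    # trigger the nightly Kubernetes tests manually
    gh workflow run kubernetes_tests.yml --repo pytorch/serve
    # check the status of recent runs
    gh run list --workflow=kubernetes_tests.yml --repo pytorch/serve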

README.md (+1)

@@ -5,6 +5,7 @@
 ![Benchmark Nightly](https://github.com/pytorch/serve/actions/workflows/benchmark_nightly.yml/badge.svg)
 ![Docker Regression Nightly](https://github.com/pytorch/serve/actions/workflows/regression_tests_docker.yml/badge.svg)
 ![KServe Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kserve_cpu_tests.yml/badge.svg)
+![Kubernetes Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kubernetes_tests.yml/badge.svg)
 
 TorchServe is a flexible and easy-to-use tool for serving and scaling PyTorch models in production.

docker/build_upload_release.py (+20)

@@ -36,6 +36,14 @@
     f"./build_image.sh -g -cv cu121 -t {organization}/torchserve:latest-gpu",
     dry_run,
 )
+try_and_handle(
+    f"./build_image.sh -bt dev -cpp -t {organization}/torchserve:latest-cpp-dev-cpu",
+    dry_run,
+)
+try_and_handle(
+    f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/torchserve:latest-cpp-dev-gpu",
+    dry_run,
+)
 try_and_handle(
     f"docker tag {organization}/torchserve:latest {organization}/torchserve:latest-cpu",
     dry_run,
@@ -48,13 +56,25 @@
     f"docker tag {organization}/torchserve:latest-gpu {organization}/torchserve:{check_ts_version()}-gpu",
     dry_run,
 )
+try_and_handle(
+    f"docker tag {organization}/torchserve:latest-cpp-dev-cpu {organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+    dry_run,
+)
+try_and_handle(
+    f"docker tag {organization}/torchserve:latest-cpp-dev-gpu {organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
+    dry_run,
+)
 
 for image in [
     f"{organization}/torchserve:latest",
     f"{organization}/torchserve:latest-cpu",
     f"{organization}/torchserve:latest-gpu",
+    f"{organization}/torchserve:latest-cpp-dev-cpu",
+    f"{organization}/torchserve:latest-cpp-dev-gpu",
     f"{organization}/torchserve:{check_ts_version()}-cpu",
     f"{organization}/torchserve:{check_ts_version()}-gpu",
+    f"{organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+    f"{organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
 ]:
     try_and_handle(f"docker push {image}", dry_run)
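
The new try_and_handle calls publish C++ development images alongside the existing CPU/GPU release images. A quick smoke check of a published tag might look like this (a sketch; the organization is assumed to be pytorch, matching the images referenced elsewhere in this commit, and the tag names come straight from the diff):

    # pull the C++ dev image and open a shell to confirm it is usable
    docker pull pytorch/torchserve:latest-cpp-dev-cpu
    docker run --rm -it pytorch/torchserve:latest-cpp-dev-cpu bash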

docker/docker_nightly.py (+22)

@@ -35,17 +35,29 @@
 project = "torchserve-nightly"
 cpu_version = f"{project}:cpu-{get_nightly_version()}"
 gpu_version = f"{project}:gpu-{get_nightly_version()}"
+cpp_dev_cpu_version = f"{project}:cpp-dev-cpu-{get_nightly_version()}"
+cpp_dev_gpu_version = f"{project}:cpp-dev-gpu-{get_nightly_version()}"
 
 # Build Nightly images and append the date in the name
 try_and_handle(f"./build_image.sh -n -t {organization}/{cpu_version}", dry_run)
 try_and_handle(
     f"./build_image.sh -g -cv cu121 -n -t {organization}/{gpu_version}",
     dry_run,
 )
+try_and_handle(
+    f"./build_image.sh -bt dev -cpp -t {organization}/{cpp_dev_cpu_version}",
+    dry_run,
+)
+try_and_handle(
+    f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/{cpp_dev_gpu_version}",
+    dry_run,
+)
 
 # Push Nightly images to official PyTorch Dockerhub account
 try_and_handle(f"docker push {organization}/{cpu_version}", dry_run)
 try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)
+try_and_handle(f"docker push {organization}/{cpp_dev_cpu_version}", dry_run)
+try_and_handle(f"docker push {organization}/{cpp_dev_gpu_version}", dry_run)
 
 # Tag nightly images with latest
 try_and_handle(
@@ -56,10 +68,20 @@
     f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
     dry_run,
 )
+try_and_handle(
+    f"docker tag {organization}/{cpp_dev_cpu_version} {organization}/{project}:latest-cpp-dev-cpu",
+    dry_run,
+)
+try_and_handle(
+    f"docker tag {organization}/{cpp_dev_gpu_version} {organization}/{project}:latest-cpp-dev-gpu",
+    dry_run,
+)
 
 # Push images with latest tag
 try_and_handle(f"docker push {organization}/{project}:latest-cpu", dry_run)
 try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)
+try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-cpu", dry_run)
+try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-gpu", dry_run)
 
 # Cleanup built images
 if args.cleanup:
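
Each nightly is published twice: under a date-stamped tag built from get_nightly_version() and under a moving latest-* tag that this script re-points every night, so a consumer can pin either. A sketch (the YYYY.MM.DD stamp format is an assumption; only the latest-* names appear in this diff):

    # pin a specific dated nightly (tag format assumed: <flavor>-YYYY.MM.DD)
    docker pull pytorch/torchserve-nightly:cpp-dev-cpu-2024.02.13
    # or track the moving tag
    docker pull pytorch/torchserve-nightly:latest-cpp-dev-cpu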

kubernetes/tests/configs/deployment.yaml (+29)

@@ -0,0 +1,29 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ts-def
+  labels:
+    app: ts-def
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: ts-def
+  template:
+    metadata:
+      labels:
+        app: ts-def
+    spec:
+      volumes:
+        - name: model-store
+          hostPath:
+            path: /host/model_store
+      containers:
+        - name: torchserve
+          image: pytorch/torchserve-nightly:latest-gpu
+          ports:
+            - containerPort: 8080
+            - containerPort: 8081
+          volumeMounts:
+            - name: model-store
+              mountPath: /home/model-server/model-store
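
The test harness below applies this manifest via deploy_cluster and polls the pod directly. Outside the harness, a standalone sanity check against the same minikube context could be (a sketch, not part of this commit):

    kubectl apply -f kubernetes/tests/configs/deployment.yaml
    kubectl rollout status deployment/ts-def --timeout=300s   # wait for the single replica
    kubectl get pods -l app=ts-def                            # pod should report Running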

kubernetes/tests/docker/Dockerfile (+7)

@@ -0,0 +1,7 @@
+FROM pytorch/torchserve-nightly:latest-gpu
+ARG EXAMPLE_DIR
+USER root
+
+RUN apt-get update && apt-get install jq -y
+COPY $EXAMPLE_DIR/../docker/config.properties /home/model-server/config.properties
+USER model-server

kubernetes/tests/docker/config.properties (+8)

@@ -0,0 +1,8 @@
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
+number_of_netty_threads=32
+job_queue_size=1000
+model_store=/home/model-server/model-store
+workflow_store=/home/model-server/wf-store
+system_metrics_cmd=ts/metrics/metric_collector.py --gpu 0
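
Binding the three listeners to 0.0.0.0 makes them reachable from outside the container on ports 8080-8082. Once the ports are reachable from the host (e.g., via the kubectl port-forward the test script sets up for 8080/8081), the standard TorchServe endpoints can be probed:

    curl http://localhost:8080/ping      # inference API health check
    curl http://localhost:8081/models    # management API: list registered models
    curl http://localhost:8082/metrics   # metrics endpoint (not forwarded by the test script)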

kubernetes/tests/scripts/test_mnist.sh (+148)

@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+
+ACCEPTABLE_CPU_CORE_USAGE=2
+DOCKER_IMAGE=pytorch/torchserve-nightly:latest-gpu
+
+# Get relative path of example dir with respect to root
+# Ex: if ROOT_DIR is ~/serve , EXAMPLE_DIR is ./kubernetes/tests/scripts
+EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
+ROOT_DIR=${EXAMPLE_DIR}/../../../
+ROOT_DIR=$(realpath "$ROOT_DIR")
+EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")
+
+function start_minikube_cluster() {
+    echo "Removing any previous Kubernetes cluster"
+    minikube delete
+    echo "Starting Kubernetes cluster"
+    minikube start --gpus all --mount-string="$GITHUB_WORKSPACE:/host" --mount
+    minikube addons enable metrics-server
+}
+
+function build_docker_image() {
+    eval $(minikube docker-env)
+    docker system prune -f
+    docker build -t $DOCKER_IMAGE --file $ROOT_DIR/$EXAMPLE_DIR/../docker/Dockerfile --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" .
+    eval $(minikube docker-env -u)
+
+}
+
+function get_model_archive() {
+    echo "Downloading archive for $2"
+    mkdir model_store -p
+    wget $1 -O model_store/"$2".mar
+    pwd
+    echo $GITHUB_WORKSPACE
+}
+
+function deploy_cluster() {
+    echo "Deploying the cluster"
+    kubectl apply -f "$1"
+    echo "Waiting for pod to come up..."
+    wait_for_pod_running "$2" 300
+    echo "Check status of the pod"
+    kubectl describe pod "$2"
+}
+
+function wait_for_pod_running() {
+    pod_name="$1"
+    max_wait_time="$2"
+    interval=5
+    start_time=$(date +%s)
+    while true; do
+        sleep "$interval"
+        pod_description=$(kubectl describe pod "$pod_name")
+        status_line=$(echo "$pod_description" | grep -E "Status:")
+        pod_status=$(echo "$status_line" | awk '{print $2}')
+        if [[ "$pod_status" == "Running" ]]; then
+            break
+        fi
+        current_time=$(date +%s)
+        if (( current_time - start_time >= max_wait_time )); then
+            echo "Timeout waiting for pod $pod_name to become Running."
+            delete_minikube_cluster
+            exit 1
+        fi
+    done
+}
+
+function delete_minikube_cluster() {
+    echo "Delete cluster"
+    minikube delete
+}
+
+function check_cpu_cores {
+
+    start_time=$(date +%s)
+    interval=10
+    while true; do
+        # Check if the Metrics API error message is present
+        if ! kubectl top pod -l app=$1 | grep -q $1 ; then
+            sleep "$interval"
+        else
+            echo "Wait for metrics output to stabilize"
+            sleep 60
+            break
+        fi
+        current_time=$(date +%s)
+        if (( current_time - start_time >= $2 )); then
+            echo "Timeout waiting for metrics information to be available"
+            delete_minikube_cluster
+            exit 1
+        fi
+    done
+    # Get the CPU cores used by the pod
+    pod_name=$(kubectl get pods -l app=$1 -o json | jq -r '.items[].metadata.name')
+    cpu=$(kubectl top pod -l app=$1 | awk "/${pod_name}/{print \$2}")
+
+    # Check if the CPU cores exceed 2
+    if [ $(echo "$cpu" | sed 's/m$//') -gt $ACCEPTABLE_CPU_CORE_USAGE ]; then
+        echo "✘ Test failed: CPU cores $(echo "$cpu" | sed 's/m$//') for $pod_name exceeded $ACCEPTABLE_CPU_CORE_USAGE" >&2
+        exit 1
+    else
+        echo "✓ SUCCESS"
+    fi
+}
+
+function make_cluster_accessible() {
+    kubectl apply -f $1
+    kubectl port-forward svc/ts-def 8080:8080 8081:8081 &
+    sleep "$2"
+}
+
+function cleanup_port_forwarding() {
+    echo "Clean up port forwarding"
+    pkill kubectl
+}
+
+function make_prediction() {
+    curl -X POST "localhost:8081/models?model_name=$1&url=$1.mar&initial_workers=1"
+    PREDICTION=$(curl http://127.0.0.1:8080/predictions/$1 -T $2)
+    EXPECTED="$3"
+    if [ "${PREDICTION}" = "${EXPECTED}" ]; then
+        echo "✓ SUCCESS"
+        cleanup_port_forwarding
+    else
+        echo "✘ Test failed: Prediction: ${PREDICTION}, expected ${EXPECTED}."
+        delete_minikube_cluster
+        exit 1
+    fi
+
+}
+
+# Setup
+start_minikube_cluster
+build_docker_image
+get_model_archive "https://torchserve.pytorch.org/mar_files/mnist_v2.mar" "mnist"
+deploy_cluster "./kubernetes/tests/configs/deployment.yaml" "ts-def"
+
+echo "No model loaded CPU usage test"
+check_cpu_cores "ts-def" 180
+
+echo "MNIST test inference"
+make_cluster_accessible "kubernetes/examples/mnist/service.yaml" 5
+make_prediction "mnist" "examples/image_classifier/mnist/test_data/0.png" "0"
+
+# Clean up
+delete_minikube_cluster
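
On CI the script inherits GITHUB_WORKSPACE from the GitHub Actions runner. To try it locally on a GPU machine with minikube, kubectl, docker, and jq installed, the variable has to be supplied by hand (a sketch, not part of this commit):

    # run from the root of the serve checkout
    export GITHUB_WORKSPACE=$(pwd)
    ./kubernetes/tests/scripts/test_mnist.sh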
