Workflow for nightly kubernetes tests #3017
@@ -0,0 +1,35 @@
name: Kubernetes Nightly Tests

on:
  workflow_dispatch:
  # Runs every day at 6:15am
  schedule:
    - cron: '15 6 * * *'

jobs:
  kubernetes-tests:
    runs-on: [self-hosted, regression-test-gpu]
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          ls -la ./
          sudo rm -rf ./* || true
          sudo rm -rf ./.??* || true
          ls -la ./
      - name: Install minikube and kubectl
        run: |
          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
          sudo install minikube-linux-amd64 /usr/local/bin/minikube
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
          echo "/usr/local/bin" >> $GITHUB_PATH
      - name: Setup Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          architecture: x64
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Validate TorchServe
        run: ./kubernetes/tests/scripts/test_mnist.sh
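
Besides the nightly cron schedule, the workflow_dispatch trigger allows the run to be started manually. A minimal sketch using the GitHub CLI; the workflow file name below is an assumption, since the actual file path is not shown in this diff:

# Trigger the nightly Kubernetes tests manually (the file name kubernetes_tests.yml is hypothetical)
gh workflow run kubernetes_tests.yml --repo pytorch/serve --ref master
# Watch the most recent run of that workflow
gh run list --workflow kubernetes_tests.yml --repo pytorch/serve --limit 1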
@@ -0,0 +1,29 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ts-def
  labels:
    app: ts-def
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ts-def
  template:
    metadata:
      labels:
        app: ts-def
    spec:
      volumes:
        - name: model-store
          hostPath:
            path: /host/model_store
      containers:
        - name: torchserve
          image: pytorch/torchserve-nightly:latest-gpu
          ports:
            - containerPort: 8080
            - containerPort: 8081
          volumeMounts:
            - name: model-store
              mountPath: /home/model-server/model-store
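
The test script later port-forwards a Service named ts-def, applied from kubernetes/examples/mnist/service.yaml, which is not part of this diff. A minimal sketch of what such a Service could look like, assuming it simply exposes the two container ports of the Deployment above; the actual manifest may differ:

# Sketch only: the real kubernetes/examples/mnist/service.yaml is not shown in this diff,
# so this manifest is an assumption based on the ports exposed by the Deployment above.
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
  name: ts-def
spec:
  selector:
    app: ts-def
  ports:
    - name: inference
      port: 8080
      targetPort: 8080
    - name: management
      port: 8081
      targetPort: 8081
EOF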
@@ -0,0 +1,7 @@
FROM pytorch/torchserve-nightly:latest-gpu
ARG EXAMPLE_DIR
USER root

RUN apt-get update && apt-get install jq -y
COPY $EXAMPLE_DIR/../docker/config.properties /home/model-server/config.properties
USER model-server
@@ -0,0 +1,8 @@
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
number_of_netty_threads=32
job_queue_size=1000
model_store=/home/model-server/model-store
workflow_store=/home/model-server/wf-store
system_metrics_cmd=ts/metrics/metric_collector.py --gpu 0
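
With these addresses, the three TorchServe APIs listen on ports 8080 (inference), 8081 (management), and 8082 (metrics). A quick sanity check once the ports are reachable, for example after the port-forward set up later in the test script (note that the script only forwards 8080 and 8081, so the metrics call would need the forward extended or has to run inside the cluster):

# Inference API health check
curl http://localhost:8080/ping
# List registered models via the management API
curl http://localhost:8081/models
# Prometheus-format metrics (port 8082 is not forwarded by the test script)
curl http://localhost:8082/metrics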
@@ -0,0 +1,147 @@
#!/usr/bin/env bash

set -o errexit -o nounset -o pipefail

ACCEPTABLE_CPU_CORE_USAGE=2
DOCKER_IMAGE=pytorch/torchserve-nightly:latest-gpu

# Get relative path of example dir:
# EXAMPLE_DIR starts as the absolute directory of this script, ROOT_DIR is the
# repository root three levels up, and the sed call rewrites EXAMPLE_DIR as a
# path relative to the repository root.
EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
ROOT_DIR=${EXAMPLE_DIR}/../../../
ROOT_DIR=$(realpath "$ROOT_DIR")
EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")

Review comment on this hunk: "crazy stuff going on here lol, worth a comment or 2"
Author reply: "lol.. added"

function start_minikube_cluster() {
    echo "Removing any previous Kubernetes cluster"
    minikube delete
    echo "Starting Kubernetes cluster"
    minikube start --gpus all --mount-string="$GITHUB_WORKSPACE:/host" --mount
    minikube addons enable metrics-server
}

function build_docker_image() {
    # Build inside minikube's Docker daemon so the image is visible to the
    # cluster without pushing it to a registry.
    eval $(minikube docker-env)
    docker system prune -f
    docker build -t $DOCKER_IMAGE --file $ROOT_DIR/$EXAMPLE_DIR/../docker/Dockerfile --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" .
    eval $(minikube docker-env -u)
}

function get_model_archive() {
    echo "Downloading archive for $2"
    mkdir model_store -p
    wget $1 -O model_store/"$2".mar
    pwd
    echo $GITHUB_WORKSPACE
}

function deploy_cluster() {
    echo "Deploying the cluster"
    kubectl apply -f "$1"
    echo "Waiting for pod to come up..."
    wait_for_pod_running "$2" 300
    echo "Check status of the pod"
    kubectl describe pod "$2"
}

function wait_for_pod_running() {
    pod_name="$1"
    max_wait_time="$2"
    interval=5
    start_time=$(date +%s)
    while true; do
        sleep "$interval"
        pod_description=$(kubectl describe pod "$pod_name")
        status_line=$(echo "$pod_description" | grep -E "Status:")
        pod_status=$(echo "$status_line" | awk '{print $2}')
        if [[ "$pod_status" == "Running" ]]; then
            break
        fi
        current_time=$(date +%s)
        if (( current_time - start_time >= max_wait_time )); then
            echo "Timeout waiting for pod $pod_name to become Running."
            delete_minikube_cluster
            exit 1
        fi
    done
}
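
As an aside, not part of the PR: the polling loop above could in principle be replaced by kubectl's built-in wait, with the caveat that the Ready condition also requires readiness probes to pass, which is stricter than the Running status polled here. A sketch, assuming the same pod name and timeout arguments:

# Possible simplification (not used in this PR): block until the pod reports Ready.
# Ready is stricter than the Running phase checked by the loop above.
kubectl wait --for=condition=Ready "pod/$pod_name" --timeout="${max_wait_time}s"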

function delete_minikube_cluster() {
    echo "Delete cluster"
    minikube delete
}

function check_cpu_cores {

    start_time=$(date +%s)
    interval=10
    while true; do
        # Check if the Metrics API error message is present
        if ! kubectl top pod -l app=$1 | grep -q $1 ; then
            sleep "$interval"
        else
            echo "Wait for metrics output to stabilize"
            sleep 60
            break
        fi
        current_time=$(date +%s)
        if (( current_time - start_time >= $2 )); then
            echo "Timeout waiting for metrics information to be available"
            delete_minikube_cluster
            exit 1
        fi
    done
    # Get the CPU usage reported for the pod (kubectl top reports millicores, e.g. "3m")
    pod_name=$(kubectl get pods -l app=$1 -o json | jq -r '.items[].metadata.name')
    cpu=$(kubectl top pod -l app=$1 | awk "/${pod_name}/{print \$2}")

    # Fail if the reported value (with the trailing "m" stripped) exceeds the threshold
    if [ $(echo "$cpu" | sed 's/m$//') -gt $ACCEPTABLE_CPU_CORE_USAGE ]; then
        echo "✘ Test failed: CPU cores $(echo "$cpu" | sed 's/m$//') for $pod_name exceeded $ACCEPTABLE_CPU_CORE_USAGE" >&2
        exit 1
    else
        echo "✓ SUCCESS"
    fi
}

Review comment on this hunk: "what's the context of this PR, why is this an important test? add either a comment here or more details in the PR description"
Author reply: "Thanks. Added this in the description"
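
To make the threshold concrete: kubectl top pod reports CPU in millicores, and the comparison above strips the trailing "m", so ACCEPTABLE_CPU_CORE_USAGE=2 effectively allows the idle server (no model loaded) at most 2 millicores. An illustrative run; the pod name and numbers below are invented, not taken from a real run:

# Illustrative only: pod name and values are made up for the example.
$ kubectl top pod -l app=ts-def
NAME                      CPU(cores)   MEMORY(bytes)
ts-def-6f9c8d7b5-abcde    1m           350Mi
# "1m" becomes "1" after stripping the trailing "m"; 1 <= 2, so the check passes.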

function make_cluster_accessible() {
    kubectl apply -f $1
    kubectl port-forward svc/ts-def 8080:8080 8081:8081 &
    sleep "$2"
}

function cleanup_port_forwarding() {
    echo "Clean up port forwarding"
    pkill kubectl
}

function make_prediction() {
    curl -X POST "localhost:8081/models?model_name=$1&url=$1.mar&initial_workers=1"
    PREDICTION=$(curl http://127.0.0.1:8080/predictions/$1 -T $2)
    EXPECTED="$3"
    if [ "${PREDICTION}" = "${EXPECTED}" ]; then
        echo "✓ SUCCESS"
        cleanup_port_forwarding
    else
        echo "✘ Test failed: Prediction: ${PREDICTION}, expected ${EXPECTED}."
        delete_minikube_cluster
        exit 1
    fi
}

# Setup
start_minikube_cluster
build_docker_image
get_model_archive "https://torchserve.pytorch.org/mar_files/mnist_v2.mar" "mnist"
deploy_cluster "./kubernetes/tests/configs/deployment.yaml" "ts-def"

echo "No model loaded CPU usage test"
check_cpu_cores "ts-def" 180

echo "MNIST test inference"
make_cluster_accessible "kubernetes/examples/mnist/service.yaml" 5
make_prediction "mnist" "examples/image_classifier/mnist/test_data/0.png" "0"

# Clean up
delete_minikube_cluster
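
For local debugging outside of the Actions runner, the script can presumably be run directly as long as minikube, kubectl, jq, and a GPU-capable Docker setup are available; GITHUB_WORKSPACE is normally set by GitHub Actions, so it has to be exported manually. A sketch, with the assumptions noted in the comments:

# Assumes a checkout of pytorch/serve on a machine with a GPU-capable Docker/minikube setup.
# GITHUB_WORKSPACE must point at the repo root: the minikube --mount-string and the
# model_store download both rely on it.
export GITHUB_WORKSPACE="$(pwd)"
./kubernetes/tests/scripts/test_mnist.sh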

Review comment on the workflow: "there seems to be a lot of code duplication between this workflow and https://github.com/pytorch/serve/blob/master/.github/workflows/kserve_cpu_tests.yml"
Author reply: "Yes, there is. Not sure if I want to merge the two. They are testing different tech stacks. KServe tests will also probably get bigger to test OIP. Won't be adding more tests for K8s unless we have a new issue uncovered."