
Commit 7d303f5

Merge branch 'master' into examples/getting_started_curl
2 parents 0dcb5ca + 1994aa0

23 files changed, +850 -2 lines changed

.github/workflows/kubernetes_tests.yml (new file, +35)

@@ -0,0 +1,35 @@
name: Kubernetes Nightly Tests

on:
  workflow_dispatch:
  # runs everyday at 6:15am
  schedule:
    - cron: '15 6 * * *'

jobs:
  kubernetes-tests:
    runs-on: [self-hosted, regression-test-gpu]
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          ls -la ./
          sudo rm -rf ./* || true
          sudo rm -rf ./.??* || true
          ls -la ./
      - name: Install minikube and kubectl
        run: |
          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
          sudo install minikube-linux-amd64 /usr/local/bin/minikube
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
          echo "/usr/local/bin" >> $GITHUB_PATH
      - name: Setup Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          architecture: x64
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Validate TorchServe
        run: ./kubernetes/tests/scripts/test_mnist.sh

.github/workflows/official_release.yml (+4)

@@ -11,6 +11,10 @@ jobs:
     steps:
       - name: Setup Conda
         uses: s-weigand/setup-conda@v1
+        with:
+          activate-conda: true
+          update-conda: false
+          python-version: "3.9"
       - name: Setup Anaconda
         run: |
           conda --version

.github/workflows/official_release_docker.yml (+2)

@@ -32,6 +32,8 @@ jobs:
           architecture: x64
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Login to Docker
         env:
           DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}}

README.md (+1)

@@ -5,6 +5,7 @@
 ![Benchmark Nightly](https://github.com/pytorch/serve/actions/workflows/benchmark_nightly.yml/badge.svg)
 ![Docker Regression Nightly](https://github.com/pytorch/serve/actions/workflows/regression_tests_docker.yml/badge.svg)
 ![KServe Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kserve_cpu_tests.yml/badge.svg)
+![Kubernetes Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kubernetes_tests.yml/badge.svg)

 TorchServe is a flexible and easy-to-use tool for serving and scaling PyTorch models in production.

SECURITY.md (+1 -1)

@@ -42,4 +42,4 @@ TorchServe as much as possible relies on automated tools to do security scanning

 ## Reporting a Vulnerability

-If you find a serious vulnerability please report it to opensource@meta.com and [email protected]
+If you find a serious vulnerability please report it to https://www.facebook.com/whitehat and [email protected]

docker/build_upload_release.py (+20)

@@ -36,6 +36,14 @@
         f"./build_image.sh -g -cv cu121 -t {organization}/torchserve:latest-gpu",
         dry_run,
     )
+    try_and_handle(
+        f"./build_image.sh -bt dev -cpp -t {organization}/torchserve:latest-cpp-dev-cpu",
+        dry_run,
+    )
+    try_and_handle(
+        f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/torchserve:latest-cpp-dev-gpu",
+        dry_run,
+    )
     try_and_handle(
         f"docker tag {organization}/torchserve:latest {organization}/torchserve:latest-cpu",
         dry_run,
@@ -48,13 +56,25 @@
         f"docker tag {organization}/torchserve:latest-gpu {organization}/torchserve:{check_ts_version()}-gpu",
         dry_run,
     )
+    try_and_handle(
+        f"docker tag {organization}/torchserve:latest-cpp-dev-cpu {organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+        dry_run,
+    )
+    try_and_handle(
+        f"docker tag {organization}/torchserve:latest-cpp-dev-gpu {organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
+        dry_run,
+    )

     for image in [
         f"{organization}/torchserve:latest",
         f"{organization}/torchserve:latest-cpu",
         f"{organization}/torchserve:latest-gpu",
+        f"{organization}/torchserve:latest-cpp-dev-cpu",
+        f"{organization}/torchserve:latest-cpp-dev-gpu",
         f"{organization}/torchserve:{check_ts_version()}-cpu",
         f"{organization}/torchserve:{check_ts_version()}-gpu",
+        f"{organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+        f"{organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
     ]:
         try_and_handle(f"docker push {image}", dry_run)

docker/docker_nightly.py (+22)

@@ -35,17 +35,29 @@
     project = "torchserve-nightly"
     cpu_version = f"{project}:cpu-{get_nightly_version()}"
     gpu_version = f"{project}:gpu-{get_nightly_version()}"
+    cpp_dev_cpu_version = f"{project}:cpp-dev-cpu-{get_nightly_version()}"
+    cpp_dev_gpu_version = f"{project}:cpp-dev-gpu-{get_nightly_version()}"

     # Build Nightly images and append the date in the name
     try_and_handle(f"./build_image.sh -n -t {organization}/{cpu_version}", dry_run)
     try_and_handle(
         f"./build_image.sh -g -cv cu121 -n -t {organization}/{gpu_version}",
         dry_run,
     )
+    try_and_handle(
+        f"./build_image.sh -bt dev -cpp -t {organization}/{cpp_dev_cpu_version}",
+        dry_run,
+    )
+    try_and_handle(
+        f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/{cpp_dev_gpu_version}",
+        dry_run,
+    )

     # Push Nightly images to official PyTorch Dockerhub account
     try_and_handle(f"docker push {organization}/{cpu_version}", dry_run)
     try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)
+    try_and_handle(f"docker push {organization}/{cpp_dev_cpu_version}", dry_run)
+    try_and_handle(f"docker push {organization}/{cpp_dev_gpu_version}", dry_run)

     # Tag nightly images with latest
     try_and_handle(
@@ -56,10 +68,20 @@
         f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
         dry_run,
     )
+    try_and_handle(
+        f"docker tag {organization}/{cpp_dev_cpu_version} {organization}/{project}:latest-cpp-dev-cpu",
+        dry_run,
+    )
+    try_and_handle(
+        f"docker tag {organization}/{cpp_dev_gpu_version} {organization}/{project}:latest-cpp-dev-gpu",
+        dry_run,
+    )

     # Push images with latest tag
     try_and_handle(f"docker push {organization}/{project}:latest-cpu", dry_run)
     try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)
+    try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-cpu", dry_run)
+    try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-gpu", dry_run)

     # Cleanup built images
     if args.cleanup:

docs/Security.md (+2)

@@ -5,6 +5,7 @@
 | Version | Supported |
 |---------| ------------------ |
 | 0.9.0 | :white_check_mark: |
+| 0.10.0 | :white_check_mark: |


 ## How we do security
@@ -36,6 +37,7 @@ TorchServe as much as possible relies on automated tools to do security scanning
 2. Using private-key/certificate files

 You can find more details in the [configuration guide](https://pytorch.org/serve/configuration.html#enable-ssl)
+6. TorchServe supports token authorization: check [documentation](https://github.com/pytorch/serve/blob/master/docs/token_authorization_api.md) for more information.
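
For context on the token authorization entry added above: the referenced token_authorization_api.md describes a key-based scheme. The snippet below is a hypothetical client-side sketch, not part of this commit, assuming token authorization is enabled and TorchServe has generated its per-API keys; `<inference-key>` and `<model_name>` are placeholders.

```
# Hypothetical sketch: with token authorization enabled, each request must
# present the generated key for that API in an Authorization header.
curl -H "Authorization: Bearer <inference-key>" \
     http://127.0.0.1:8080/predictions/<model_name>
```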

examples/LLM/llama2/chat_app/Readme.md (+27)

@@ -9,6 +9,33 @@ We are using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) in
 You can run this example on your laptop to understand how to use TorchServe


+## Quick Start Guide
+
+To get started with TorchServe, run the following:
+
+```
+# 1: Set HF Token as Env variable
+export HUGGINGFACE_TOKEN=<Token> # get this from your HuggingFace account
+
+# 2: Build TorchServe Image for Serving llama2-7b model with 4-bit quantization
+./examples/llm/llama2/chat_app/docker/build_image.sh meta-llama/Llama-2-7b-chat-hf
+
+# 3: Launch the streamlit app for server & client
+docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v <model-store>:/home/model-server/model-store pytorch/torchserve:meta-llama---Llama-2-7b-chat-hf
+```
+In step 3, `<model-store>` is the location where you want the model to be downloaded.
+
+### What to expect
+This launches two streamlit apps:
+1. TorchServe Server app to start/stop TorchServe, load the model, scale workers up/down, and configure the dynamic batch_size (currently llama-cpp-python doesn't support batch_size > 1).
+    - Since this app is targeted at Apple M1/M2 laptops, we load a 4-bit quantized version of llama2 using llama-cpp-python.
+2. Client chat app where you can chat with the model. A slider lets you send concurrent requests to the model. The current app doesn't have a good mechanism to show multiple responses in parallel; you will notice a streaming response for the first request followed by a complete response for the next request.
+
+Currently, this launches the llama2-7b model with 4-bit quantization running on CPU.
+
+To make use of the M1/M2 GPU, follow the guide below to do a standalone TorchServe installation.
+
 ## Architecture

 ![Chatbot Architecture](./screenshots/architecture.png)
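
As a companion to the quick start above, the Streamlit client added later in this commit posts a JSON body with a `prompt` and generation `params` to TorchServe's predictions endpoint. A minimal command-line sketch of the same request, assuming the default ports and that the registered model name matches the image tag built above (adjust if yours differs):

```
# Sketch of the request the client app sends; model name and ports are assumptions.
curl -X POST http://127.0.0.1:8080/predictions/meta-llama---Llama-2-7b-chat-hf \
     -H "Content-Type: application/json" \
     -d '{"prompt": "What are the names of the planets in the solar system?", "params": {"max_new_tokens": 50, "top_p": 0.5, "temperature": 0.5}}'
```

The response streams back as plain text chunks, which is what the client app accumulates and renders.
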
New file (+26): Dockerfile for the llama2 chat app docker example

@@ -0,0 +1,26 @@
ARG BASE_IMAGE=pytorch/torchserve:latest-gpu

FROM $BASE_IMAGE as server
ARG BASE_IMAGE
ARG EXAMPLE_DIR
ARG MODEL_NAME
ARG HUGGINGFACE_TOKEN

USER root

ENV MODEL_NAME=$MODEL_NAME

RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    apt-get update && \
    apt-get install libopenmpi-dev git -y

COPY $EXAMPLE_DIR/requirements.txt /home/model-server/chat_bot/requirements.txt
RUN pip install -r /home/model-server/chat_bot/requirements.txt && huggingface-cli login --token $HUGGINGFACE_TOKEN

COPY $EXAMPLE_DIR /home/model-server/chat_bot
COPY $EXAMPLE_DIR/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
COPY $EXAMPLE_DIR/config.properties /home/model-server/config.properties

WORKDIR /home/model-server/chat_bot
RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \
    && chown -R model-server /home/model-server
New file (+28): build_image.sh for the llama2 chat app docker example

@@ -0,0 +1,28 @@
#!/bin/bash

# Check if there are enough arguments
if [ "$#" -eq 0 ] || [ "$#" -gt 1 ]; then
    echo "Usage: $0 <HF Model>"
    exit 1
fi

MODEL_NAME=$(echo "$1" | sed 's/\//---/g')
echo "Model: " $MODEL_NAME

BASE_IMAGE="pytorch/torchserve:latest-cpu"

DOCKER_TAG="pytorch/torchserve:${MODEL_NAME}"

# Get relative path of example dir
EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
ROOT_DIR=${EXAMPLE_DIR}/../../../../..
ROOT_DIR=$(realpath "$ROOT_DIR")
EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")

# Build docker image for the application
DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg MODEL_NAME="${MODEL_NAME}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" .

echo "Run the following command to start the chat bot"
echo ""
echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store $DOCKER_TAG
echo ""
New file (+118): Streamlit client app (Python) for the llama2 chat app docker example

@@ -0,0 +1,118 @@
import json
import os
from concurrent.futures import ThreadPoolExecutor

import requests
import streamlit as st

MODEL_NAME = os.environ["MODEL_NAME"]

# App title
st.set_page_config(page_title="TorchServe Chatbot")

with st.sidebar:
    st.title("TorchServe Chatbot")

    st.session_state.model_loaded = False
    try:
        res = requests.get(url="http://localhost:8080/ping")
        res = requests.get(url=f"http://localhost:8081/models/{MODEL_NAME}")
        status = "NOT READY"
        if res.status_code == 200:
            status = json.loads(res.text)[0]["workers"][0]["status"]

        if status == "READY":
            st.session_state.model_loaded = True
            st.success("Proceed to entering your prompt message!", icon="👉")
        else:
            st.warning("Model not loaded in TorchServe", icon="⚠️")

    except requests.ConnectionError:
        st.warning("TorchServe is not up. Try again", icon="⚠️")

    if st.session_state.model_loaded:
        st.success(f"Model loaded: {MODEL_NAME}!", icon="👉")

    st.subheader("Model parameters")
    temperature = st.sidebar.slider(
        "temperature", min_value=0.1, max_value=1.0, value=0.5, step=0.1
    )
    top_p = st.sidebar.slider(
        "top_p", min_value=0.1, max_value=1.0, value=0.5, step=0.1
    )
    max_new_tokens = st.sidebar.slider(
        "max_new_tokens", min_value=48, max_value=512, value=50, step=4
    )
    concurrent_requests = st.sidebar.select_slider(
        "concurrent_requests", options=[2**j for j in range(0, 8)]
    )

# Store LLM generated responses
if "messages" not in st.session_state.keys():
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I assist you today?"}
    ]

# Display or clear chat messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])


def clear_chat_history():
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I assist you today?"}
    ]


st.sidebar.button("Clear Chat History", on_click=clear_chat_history)


def generate_model_response(prompt_input, executor):
    string_dialogue = (
        "Question: What are the names of the planets in the solar system? Answer: "
    )
    headers = {"Content-type": "application/json", "Accept": "text/plain"}
    url = f"http://127.0.0.1:8080/predictions/{MODEL_NAME}"
    data = json.dumps(
        {
            "prompt": prompt_input,
            "params": {
                "max_new_tokens": max_new_tokens,
                "top_p": top_p,
                "temperature": temperature,
            },
        }
    )
    res = [
        executor.submit(requests.post, url=url, data=data, headers=headers, stream=True)
        for i in range(concurrent_requests)
    ]

    return res, max_new_tokens


# User-provided prompt
if prompt := st.chat_input():
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.write(prompt)

# Generate a new response if last message is not from assistant
if st.session_state.messages[-1]["role"] != "assistant":
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            with ThreadPoolExecutor() as executor:
                futures, max_tokens = generate_model_response(prompt, executor)
                placeholder = st.empty()
                full_response = ""
                count = 0
                for future in futures:
                    response = future.result()
                    for chunk in response.iter_content(chunk_size=None):
                        if chunk:
                            data = chunk.decode("utf-8")
                            full_response += data
                            placeholder.markdown(full_response)
                message = {"role": "assistant", "content": full_response}
                st.session_state.messages.append(message)
New file (+9): config.properties for the llama2 chat app docker example

@@ -0,0 +1,9 @@
metrics_mode=prometheus
model_metrics_auto_detect=true
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
number_of_netty_threads=32
job_queue_size=1000
model_store=/home/model-server/model-store
workflow_store=/home/model-server/wf-store
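
This config enables Prometheus-format metrics (`metrics_mode=prometheus`) and binds the inference, management, and metrics APIs to ports 8080-8082. A quick smoke test once a container using it is running, assuming the ports are mapped to localhost as in the Readme's docker run command:

```
# Sketch: liveness check on the inference address, then scrape Prometheus metrics.
curl http://127.0.0.1:8080/ping
curl http://127.0.0.1:8082/metrics
```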
