
Commit 7d303f5

Merge branch 'master' into examples/getting_started_curl
2 parents 0dcb5ca + 1994aa0

23 files changed, +850 -2 lines changed

.github/workflows/kubernetes_tests.yml (new file, +35)

@@ -0,0 +1,35 @@
name: Kubernetes Nightly Tests

on:
  workflow_dispatch:
  # runs everyday at 6:15am
  schedule:
    - cron: '15 6 * * *'

jobs:
  kubernetes-tests:
    runs-on: [self-hosted, regression-test-gpu]
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          ls -la ./
          sudo rm -rf ./* || true
          sudo rm -rf ./.??* || true
          ls -la ./
      - name: Install minikube and kubectl
        run: |
          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
          sudo install minikube-linux-amd64 /usr/local/bin/minikube
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
          echo "/usr/local/bin" >> $GITHUB_PATH
      - name: Setup Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          architecture: x64
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Validate TorchServe
        run: ./kubernetes/tests/scripts/test_mnist.sh

.github/workflows/official_release.yml (+4)

@@ -11,6 +11,10 @@ jobs:
     steps:
       - name: Setup Conda
         uses: s-weigand/setup-conda@v1
+        with:
+          activate-conda: true
+          update-conda: false
+          python-version: "3.9"
       - name: Setup Anaconda
         run: |
           conda --version

.github/workflows/official_release_docker.yml (+2)

@@ -32,6 +32,8 @@ jobs:
           architecture: x64
       - name: Checkout TorchServe
         uses: actions/checkout@v3
+        with:
+          submodules: recursive
       - name: Login to Docker
         env:
           DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}}

README.md (+1)

@@ -5,6 +5,7 @@
 ![Benchmark Nightly](https://github.com/pytorch/serve/actions/workflows/benchmark_nightly.yml/badge.svg)
 ![Docker Regression Nightly](https://github.com/pytorch/serve/actions/workflows/regression_tests_docker.yml/badge.svg)
 ![KServe Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kserve_cpu_tests.yml/badge.svg)
+![Kubernetes Regression Nightly](https://github.com/pytorch/serve/actions/workflows/kubernetes_tests.yml/badge.svg)

 TorchServe is a flexible and easy-to-use tool for serving and scaling PyTorch models in production.

SECURITY.md (+1 -1)

@@ -42,4 +42,4 @@ TorchServe as much as possible relies on automated tools to do security scanning

 ## Reporting a Vulnerability

-If you find a serious vulnerability please report it to opensource@meta.com and [email protected]
+If you find a serious vulnerability please report it to https://www.facebook.com/whitehat and [email protected]

docker/build_upload_release.py (+20)

@@ -36,6 +36,14 @@
         f"./build_image.sh -g -cv cu121 -t {organization}/torchserve:latest-gpu",
         dry_run,
     )
+    try_and_handle(
+        f"./build_image.sh -bt dev -cpp -t {organization}/torchserve:latest-cpp-dev-cpu",
+        dry_run,
+    )
+    try_and_handle(
+        f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/torchserve:latest-cpp-dev-gpu",
+        dry_run,
+    )
     try_and_handle(
         f"docker tag {organization}/torchserve:latest {organization}/torchserve:latest-cpu",
         dry_run,
@@ -48,13 +56,25 @@
         f"docker tag {organization}/torchserve:latest-gpu {organization}/torchserve:{check_ts_version()}-gpu",
         dry_run,
     )
+    try_and_handle(
+        f"docker tag {organization}/torchserve:latest-cpp-dev-cpu {organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+        dry_run,
+    )
+    try_and_handle(
+        f"docker tag {organization}/torchserve:latest-cpp-dev-gpu {organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
+        dry_run,
+    )

     for image in [
         f"{organization}/torchserve:latest",
         f"{organization}/torchserve:latest-cpu",
         f"{organization}/torchserve:latest-gpu",
+        f"{organization}/torchserve:latest-cpp-dev-cpu",
+        f"{organization}/torchserve:latest-cpp-dev-gpu",
         f"{organization}/torchserve:{check_ts_version()}-cpu",
         f"{organization}/torchserve:{check_ts_version()}-gpu",
+        f"{organization}/torchserve:{check_ts_version()}-cpp-dev-cpu",
+        f"{organization}/torchserve:{check_ts_version()}-cpp-dev-gpu",
     ]:
         try_and_handle(f"docker push {image}", dry_run)

docker/docker_nightly.py (+22)

@@ -35,17 +35,29 @@
     project = "torchserve-nightly"
     cpu_version = f"{project}:cpu-{get_nightly_version()}"
     gpu_version = f"{project}:gpu-{get_nightly_version()}"
+    cpp_dev_cpu_version = f"{project}:cpp-dev-cpu-{get_nightly_version()}"
+    cpp_dev_gpu_version = f"{project}:cpp-dev-gpu-{get_nightly_version()}"

     # Build Nightly images and append the date in the name
     try_and_handle(f"./build_image.sh -n -t {organization}/{cpu_version}", dry_run)
     try_and_handle(
         f"./build_image.sh -g -cv cu121 -n -t {organization}/{gpu_version}",
         dry_run,
     )
+    try_and_handle(
+        f"./build_image.sh -bt dev -cpp -t {organization}/{cpp_dev_cpu_version}",
+        dry_run,
+    )
+    try_and_handle(
+        f"./build_image.sh -bt dev -g -cv cu121 -cpp -t {organization}/{cpp_dev_gpu_version}",
+        dry_run,
+    )

     # Push Nightly images to official PyTorch Dockerhub account
     try_and_handle(f"docker push {organization}/{cpu_version}", dry_run)
     try_and_handle(f"docker push {organization}/{gpu_version}", dry_run)
+    try_and_handle(f"docker push {organization}/{cpp_dev_cpu_version}", dry_run)
+    try_and_handle(f"docker push {organization}/{cpp_dev_gpu_version}", dry_run)

     # Tag nightly images with latest
     try_and_handle(
@@ -56,10 +68,20 @@
         f"docker tag {organization}/{gpu_version} {organization}/{project}:latest-gpu",
         dry_run,
     )
+    try_and_handle(
+        f"docker tag {organization}/{cpp_dev_cpu_version} {organization}/{project}:latest-cpp-dev-cpu",
+        dry_run,
+    )
+    try_and_handle(
+        f"docker tag {organization}/{cpp_dev_gpu_version} {organization}/{project}:latest-cpp-dev-gpu",
+        dry_run,
+    )

     # Push images with latest tag
     try_and_handle(f"docker push {organization}/{project}:latest-cpu", dry_run)
     try_and_handle(f"docker push {organization}/{project}:latest-gpu", dry_run)
+    try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-cpu", dry_run)
+    try_and_handle(f"docker push {organization}/{project}:latest-cpp-dev-gpu", dry_run)

     # Cleanup built images
     if args.cleanup:

docs/Security.md (+2)

@@ -5,6 +5,7 @@
 | Version | Supported |
 |---------| ------------------ |
 | 0.9.0 | :white_check_mark: |
+| 0.10.0 | :white_check_mark: |


 ## How we do security
@@ -36,6 +37,7 @@ TorchServe as much as possible relies on automated tools to do security scanning
 2. Using private-key/certificate files

 You can find more details in the [configuration guide](https://pytorch.org/serve/configuration.html#enable-ssl)
+6. TorchServe supports token authorization: check [documentation](https://github.com/pytorch/serve/blob/master/docs/token_authorization_api.md) for more information.
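
For context on the token authorization entry added above: the referenced token_authorization_api.md describes a key-based scheme. The snippet below is a hypothetical client-side sketch, not part of this commit, assuming token authorization is enabled and TorchServe has generated its per-API keys; `<inference-key>` and `<model_name>` are placeholders.

```
# Hypothetical sketch: with token authorization enabled, each request must
# present the generated key for that API in an Authorization header.
curl -H "Authorization: Bearer <inference-key>" \
     http://127.0.0.1:8080/predictions/<model_name>
```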

examples/LLM/llama2/chat_app/Readme.md (+27)

@@ -9,6 +9,33 @@ We are using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) in
 You can run this example on your laptop to understand how to use TorchServe


+## Quick Start Guide
+
+To get started with TorchServe, run the following:
+
+```
+# 1: Set HF Token as Env variable
+export HUGGINGFACE_TOKEN=<Token> # get this from your HuggingFace account
+
+# 2: Build TorchServe Image for Serving llama2-7b model with 4-bit quantization
+./examples/llm/llama2/chat_app/docker/build_image.sh meta-llama/Llama-2-7b-chat-hf
+
+# 3: Launch the streamlit app for server & client
+docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v <model-store>:/home/model-server/model-store pytorch/torchserve:meta-llama---Llama-2-7b-chat-hf
+```
+In step 3, `<model-store>` is the location where you want the model to be downloaded.
+
+### What to expect
+This launches two streamlit apps:
+1. TorchServe Server app to start/stop TorchServe, load the model, scale workers up/down, and configure the dynamic batch_size (currently llama-cpp-python doesn't support batch_size > 1).
+    - Since this app is targeted at Apple M1/M2 laptops, we load a 4-bit quantized version of llama2 using llama-cpp-python.
+2. Client chat app where you can chat with the model. A slider lets you send concurrent requests to the model. The current app doesn't have a good mechanism to show multiple responses in parallel; you will notice a streaming response for the first request followed by a complete response for the next request.
+
+Currently, this launches the llama2-7b model with 4-bit quantization running on CPU.
+
+To make use of the M1/M2 GPU, follow the guide below to do a standalone TorchServe installation.
+
 ## Architecture

 ![Chatbot Architecture](./screenshots/architecture.png)
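
As a companion to the quick start above, the Streamlit client added later in this commit posts a JSON body with a `prompt` and generation `params` to TorchServe's predictions endpoint. A minimal command-line sketch of the same request, assuming the default ports and that the registered model name matches the image tag built above (adjust if yours differs):

```
# Sketch of the request the client app sends; model name and ports are assumptions.
curl -X POST http://127.0.0.1:8080/predictions/meta-llama---Llama-2-7b-chat-hf \
     -H "Content-Type: application/json" \
     -d '{"prompt": "What are the names of the planets in the solar system?", "params": {"max_new_tokens": 50, "top_p": 0.5, "temperature": 0.5}}'
```

The response streams back as plain text chunks, which is what the client app accumulates and renders.
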
New file (+26): Dockerfile for the llama2 chat app docker example

@@ -0,0 +1,26 @@
ARG BASE_IMAGE=pytorch/torchserve:latest-gpu

FROM $BASE_IMAGE as server
ARG BASE_IMAGE
ARG EXAMPLE_DIR
ARG MODEL_NAME
ARG HUGGINGFACE_TOKEN

USER root

ENV MODEL_NAME=$MODEL_NAME

RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    apt-get update && \
    apt-get install libopenmpi-dev git -y

COPY $EXAMPLE_DIR/requirements.txt /home/model-server/chat_bot/requirements.txt
RUN pip install -r /home/model-server/chat_bot/requirements.txt && huggingface-cli login --token $HUGGINGFACE_TOKEN

COPY $EXAMPLE_DIR /home/model-server/chat_bot
COPY $EXAMPLE_DIR/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
COPY $EXAMPLE_DIR/config.properties /home/model-server/config.properties

WORKDIR /home/model-server/chat_bot
RUN chmod +x /usr/local/bin/dockerd-entrypoint.sh \
    && chown -R model-server /home/model-server
New file (+28): build_image.sh for the llama2 chat app docker example

@@ -0,0 +1,28 @@
#!/bin/bash

# Check if there are enough arguments
if [ "$#" -eq 0 ] || [ "$#" -gt 1 ]; then
    echo "Usage: $0 <HF Model>"
    exit 1
fi

MODEL_NAME=$(echo "$1" | sed 's/\//---/g')
echo "Model: " $MODEL_NAME

BASE_IMAGE="pytorch/torchserve:latest-cpu"

DOCKER_TAG="pytorch/torchserve:${MODEL_NAME}"

# Get relative path of example dir
EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
ROOT_DIR=${EXAMPLE_DIR}/../../../../..
ROOT_DIR=$(realpath "$ROOT_DIR")
EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")

# Build docker image for the application
DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg MODEL_NAME="${MODEL_NAME}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" .

echo "Run the following command to start the chat bot"
echo ""
echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store $DOCKER_TAG
echo ""
New file (+118): Streamlit client app (Python) for the llama2 chat app docker example

@@ -0,0 +1,118 @@
import json
import os
from concurrent.futures import ThreadPoolExecutor

import requests
import streamlit as st

MODEL_NAME = os.environ["MODEL_NAME"]

# App title
st.set_page_config(page_title="TorchServe Chatbot")

with st.sidebar:
    st.title("TorchServe Chatbot")

    st.session_state.model_loaded = False
    try:
        res = requests.get(url="http://localhost:8080/ping")
        res = requests.get(url=f"http://localhost:8081/models/{MODEL_NAME}")
        status = "NOT READY"
        if res.status_code == 200:
            status = json.loads(res.text)[0]["workers"][0]["status"]

        if status == "READY":
            st.session_state.model_loaded = True
            st.success("Proceed to entering your prompt message!", icon="👉")
        else:
            st.warning("Model not loaded in TorchServe", icon="⚠️")

    except requests.ConnectionError:
        st.warning("TorchServe is not up. Try again", icon="⚠️")

    if st.session_state.model_loaded:
        st.success(f"Model loaded: {MODEL_NAME}!", icon="👉")

    st.subheader("Model parameters")
    temperature = st.sidebar.slider(
        "temperature", min_value=0.1, max_value=1.0, value=0.5, step=0.1
    )
    top_p = st.sidebar.slider(
        "top_p", min_value=0.1, max_value=1.0, value=0.5, step=0.1
    )
    max_new_tokens = st.sidebar.slider(
        "max_new_tokens", min_value=48, max_value=512, value=50, step=4
    )
    concurrent_requests = st.sidebar.select_slider(
        "concurrent_requests", options=[2**j for j in range(0, 8)]
    )

# Store LLM generated responses
if "messages" not in st.session_state.keys():
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I assist you today?"}
    ]

# Display or clear chat messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])


def clear_chat_history():
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I assist you today?"}
    ]


st.sidebar.button("Clear Chat History", on_click=clear_chat_history)


def generate_model_response(prompt_input, executor):
    string_dialogue = (
        "Question: What are the names of the planets in the solar system? Answer: "
    )
    headers = {"Content-type": "application/json", "Accept": "text/plain"}
    url = f"http://127.0.0.1:8080/predictions/{MODEL_NAME}"
    data = json.dumps(
        {
            "prompt": prompt_input,
            "params": {
                "max_new_tokens": max_new_tokens,
                "top_p": top_p,
                "temperature": temperature,
            },
        }
    )
    res = [
        executor.submit(requests.post, url=url, data=data, headers=headers, stream=True)
        for i in range(concurrent_requests)
    ]

    return res, max_new_tokens


# User-provided prompt
if prompt := st.chat_input():
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.write(prompt)

# Generate a new response if last message is not from assistant
if st.session_state.messages[-1]["role"] != "assistant":
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            with ThreadPoolExecutor() as executor:
                futures, max_tokens = generate_model_response(prompt, executor)
                placeholder = st.empty()
                full_response = ""
                count = 0
                for future in futures:
                    response = future.result()
                    for chunk in response.iter_content(chunk_size=None):
                        if chunk:
                            data = chunk.decode("utf-8")
                            full_response += data
                            placeholder.markdown(full_response)
                message = {"role": "assistant", "content": full_response}
                st.session_state.messages.append(message)
New file (+9): config.properties for the llama2 chat app docker example

@@ -0,0 +1,9 @@
metrics_mode=prometheus
model_metrics_auto_detect=true
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
number_of_netty_threads=32
job_queue_size=1000
model_store=/home/model-server/model-store
workflow_store=/home/model-server/wf-store
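
This config enables Prometheus-format metrics (`metrics_mode=prometheus`) and binds the inference, management, and metrics APIs to ports 8080-8082. A quick smoke test once a container using it is running, assuming the ports are mapped to localhost as in the Readme's docker run command:

```
# Sketch: liveness check on the inference address, then scrape Prometheus metrics.
curl http://127.0.0.1:8080/ping
curl http://127.0.0.1:8082/metrics
```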
