
Commit ffe1ed2

Add Llama 3 support to chat bot example (#3131)

Authored May 3, 2024

* mv example from llama2 to llama
* Update path in README.md
* Updated llama readme files
* Updated chat bot example to support llama3
* fix lint error
* More lint issues fixed

1 parent: 5c1682a

27 files changed: +186 −133 lines changed
 

‎README.md

+3 −3

@@ -79,7 +79,7 @@ Refer to [torchserve docker](docker/README.md) for details.
 * Microsoft [DeepSpeed](examples/large_models/deepspeed), [DeepSpeed-Mii](examples/large_models/deepspeed_mii)
 * Hugging Face [Accelerate](examples/large_models/Huggingface_accelerate), [Diffusers](examples/diffusers)
 * Running large models on AWS [Sagemaker](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-tutorials-torchserve.html) and [Inferentia2](https://pytorch.org/blog/high-performance-llama/)
-* Running [Llama 2 Chatbot locally on Mac](examples/LLM/llama2)
+* Running [Meta Llama Chatbot locally on Mac](examples/LLM/llama)
 * Monitoring using Grafana and [Datadog](https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve)
 
 

@@ -90,8 +90,8 @@ Refer to [torchserve docker](docker/README.md) for details.
 
 
 ## 🏆 Highlighted Examples
-* [Serving Llama 2 with TorchServe](examples/LLM/llama2/README.md)
-* [Chatbot with Llama 2 on Mac 🦙💬](examples/LLM/llama2/chat_app)
+* [Serving Meta Llama with TorchServe](examples/LLM/llama/README.md)
+* [Chatbot with Meta Llama on Mac 🦙💬](examples/LLM/llama/chat_app)
 * [🤗 HuggingFace Transformers](examples/Huggingface_Transformers) with a [Better Transformer Integration/ Flash Attention & Xformer Memory Efficient ](examples/Huggingface_Transformers#Speed-up-inference-with-Better-Transformer)
 * [Stable Diffusion](examples/diffusers)
 * [Model parallel inference](examples/Huggingface_Transformers#model-parallelism)

‎examples/LLM/llama2/README.md ‎examples/LLM/llama/README.md

+9 −9

@@ -1,13 +1,13 @@
-# Llama 2: Next generation of Meta's Language Model
-![Llama 2](./images/llama.png)
+# Meta Llama: Next generation of Meta's Language Model
+![Llama](./images/llama.png)
 
-TorchServe supports serving Llama 2 in a number of ways. The examples covered in this document range from someone new to TorchServe learning how to serve Llama 2 with an app, to an advanced user of TorchServe using micro batching and streaming response with Llama 2
+TorchServe supports serving Meta Llama in a number of ways. The examples covered in this document range from someone new to TorchServe learning how to serve Meta Llama with an app, to an advanced user of TorchServe using micro batching and streaming response with Meta Llama.
 
-## 🦙💬 Llama 2 Chatbot
+## 🦙💬 Meta Llama Chatbot
 
-### [Example Link](https://github.com/pytorch/serve/tree/master/examples/LLM/llama2/chat_app)
+### [Example Link](https://github.com/pytorch/serve/tree/master/examples/LLM/llama/chat_app)
 
-This example shows how to deploy a llama2 chat app using TorchServe.
+This example shows how to deploy a Llama chat app using TorchServe.
 We use [streamlit](https://github.com/streamlit/streamlit) to create the app
 
 This example is using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python).

@@ -16,11 +16,11 @@ You can run this example on your laptop to understand how to use TorchServe, how
 
 ![Chatbot Architecture](./chat_app/screenshots/architecture.png)
 
-## Llama 2 with HuggingFace
+## Meta Llama with HuggingFace
 
-### [Example Link](https://github.com/pytorch/serve/tree/master/examples/large_models/Huggingface_accelerate/llama2)
+### [Example Link](https://github.com/pytorch/serve/tree/master/examples/large_models/Huggingface_accelerate/llama)
 
-This example shows how to serve Llama 2 - 70b model with limited resource using [HuggingFace](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf). It shows the following optimizations
+This example shows how to serve the meta-llama/Meta-Llama-3-70B-Instruct model with limited resources using [HuggingFace](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct). It shows the following optimizations:
 1) HuggingFace `accelerate`. This option can be activated with `low_cpu_mem_usage=True`.
 2) Quantization from [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) using `load_in_8bit=True`
 The model is first created on the Meta device (with empty weights) and the state dict is then loaded inside it (shard by shard in the case of a sharded checkpoint).

‎examples/LLM/llama2/chat_app/Readme.md ‎examples/LLM/llama/chat_app/Readme.md

+12 −12

@@ -1,7 +1,7 @@
 
-# TorchServe Llama 2 Chatapp
+# TorchServe Llama Chatapp
 
-This is an example showing how to deploy a llama2 chat app using TorchServe.
+This is an example showing how to deploy a Llama chat app using TorchServe.
 We use [streamlit](https://github.com/streamlit/streamlit) to create the app
 
 We are using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) in this example
@@ -17,21 +17,21 @@ To get started with TorchServe, you need to run the following
 # 1: Set HF Token as Env variable
 export HUGGINGFACE_TOKEN=<Token> # get this from your HuggingFace account
 
-# 2: Build TorchServe Image for Serving llama2-7b model with 4-bit quantization
-./examples/llm/llama2/chat_app/docker/build_image.sh meta-llama/Llama-2-7b-chat-hf
+# 2: Build TorchServe Chat Bot Image for Serving
+./examples/LLM/llama/chat_app/docker/build_image.sh
 
 # 3: Launch the streamlit app for server & client
-docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v <model-store>:/home/model-server/model-store pytorch/torchserve:meta-llama---Llama-2-7b-chat-hf
+docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v <model-store>:/home/model-server/model-store -e MODEL_NAME=meta-llama/Meta-Llama-3-8B-Instruct pytorch/torchserve:chat_bot
 ```
 In step 3, `<model-store>` is a location where you want the model to be downloaded
 
 ### What to expect
 This launches two streamlit apps
 1. TorchServe Server app to start/stop TorchServe, load model, scale up/down workers, configure dynamic batch_size (currently llama-cpp-python doesn't support batch_size > 1)
-    - Since this app is targeted for Apple M1/M2 laptops, we load a 4-bit quantized version of llama2 using llama-cpp-python.
+    - Since this app is targeted for Apple M1/M2 laptops, we load a 4-bit quantized version of llama using llama-cpp-python.
 2. Client chat app where you can chat with the model. There is a slider to send concurrent requests to the model. The current app doesn't have a good mechanism to show multiple responses in parallel. You can notice streaming response for the first request followed by a complete response for the next request.
 
-Currently, this launches llama2-7b model with 4-bit quantization running on CPU.
+Currently, this launches Meta-Llama-3-8B-Instruct with 4-bit quantization running on CPU.
 
 To make use of M1/M2 GPU, you can follow the below guide to do a standalone TorchServe installation.
 
@@ -55,8 +55,8 @@ javac 17.0.8
 You can download it from [java](https://www.oracle.com/java/technologies/downloads/#jdk17-mac)
 2) Install conda with support for arm64
 
-3) Since we are running this example on Mac, we will use the 7B llama2 model.
-Download llama2-7b weights by following instructions [here](https://github.com/pytorch/serve/tree/master/examples/large_models/Huggingface_accelerate/llama2#step-1-download-model-permission)
+3) Since we are running this example on Mac, we will use the Meta-Llama-3-8B-Instruct model.
+Download Meta-Llama-3-8B-Instruct weights by following instructions [here](https://github.com/pytorch/serve/tree/master/examples/large_models/Huggingface_accelerate/llama#step-1-download-model-permission)
 
 4) Install streamlit with
 
@@ -80,9 +80,9 @@ pip install torchserve torch-model-archiver torch-workflow-archiver
 Run this script to create `llamacpp.tar.gz` to be loaded in TorchServe
 
 ```
-source package_llama.sh <path to llama2 snapshot folder>
+source package_llama.sh <path to llama snapshot folder>
 ```
-This creates the quantized weights in `$LLAMA2_WEIGHTS`
+This creates the quantized weights in `$LLAMA_Q4_MODEL`
 
 For subsequent runs, we don't need to regenerate these weights. We only need to package the handler and model-config.yaml in the tar file.
 
@@ -97,7 +97,7 @@ You might need to run the below command if the script output indicates it.
 sudo xcodebuild -license
 ```
 
-The script is setting an env variable `LLAMA2_Q4_MODEL` and using this in the handler. In an actual use-case, you would set the path to the weights in `model-config.yaml`
+The script is setting an env variable `LLAMA_Q4_MODEL` and using this in the handler. In an actual use-case, you would set the path to the weights in `model-config.yaml`
 
 ```
 handler:
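
Putting the new pieces together, the end-to-end flow from the updated Readme looks roughly like this (a sketch assuming you run from the repo root; `./model_store_1` is only an example model-store location, matching the path echoed by `build_image.sh`, and `<Token>` is your HuggingFace token):

```
# 1: Set HF Token as Env variable
export HUGGINGFACE_TOKEN=<Token>

# 2: Build the generic chat bot image (no model is baked in any more)
./examples/LLM/llama/chat_app/docker/build_image.sh

# 3: Pick the model at run time via the MODEL_NAME env variable
docker run --rm -it --platform linux/amd64 \
  -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 \
  -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 \
  -v $(pwd)/model_store_1:/home/model-server/model-store \
  -e MODEL_NAME=meta-llama/Meta-Llama-3-8B-Instruct \
  pytorch/torchserve:chat_bot
```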

‎examples/LLM/llama2/chat_app/docker/Dockerfile ‎examples/LLM/llama/chat_app/docker/Dockerfile

+6 −3

@@ -3,20 +3,23 @@ ARG BASE_IMAGE=pytorch/torchserve:latest-gpu
 FROM $BASE_IMAGE as server
 ARG BASE_IMAGE
 ARG EXAMPLE_DIR
-ARG MODEL_NAME
 ARG HUGGINGFACE_TOKEN
 
 USER root
 
-ENV MODEL_NAME=$MODEL_NAME
-
 RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
     apt-get update && \
     apt-get install libopenmpi-dev git -y
 
 COPY $EXAMPLE_DIR/requirements.txt /home/model-server/chat_bot/requirements.txt
 RUN pip install -r /home/model-server/chat_bot/requirements.txt && huggingface-cli login --token $HUGGINGFACE_TOKEN
 
+WORKDIR /home/model-server/chat_bot
+RUN git clone https://github.com/ggerganov/llama.cpp.git build && \
+    cd build && \
+    make && \
+    python -m pip install -r requirements.txt
+
 COPY $EXAMPLE_DIR /home/model-server/chat_bot
 COPY $EXAMPLE_DIR/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
 COPY $EXAMPLE_DIR/config.properties /home/model-server/config.properties
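
Because llama.cpp is now cloned and built while the image is built (rather than during model packaging), the `convert.py` and `quantize` tools ship inside the image under `/home/model-server/chat_bot/build`. A quick, hypothetical way to confirm this after building, overriding the entrypoint just for inspection:

```
docker run --rm --entrypoint ls pytorch/torchserve:chat_bot /home/model-server/chat_bot/build
```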
examples/LLM/llama/chat_app/docker/Download_model.py (new file)

+53

@@ -0,0 +1,53 @@
+import argparse
+import os
+
+from huggingface_hub import HfApi, snapshot_download
+
+
+def dir_path(path_str):
+    if os.path.isdir(path_str):
+        return path_str
+    elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y":
+        os.makedirs(path_str)
+        return path_str
+    else:
+        raise NotADirectoryError(path_str)
+
+
+class HFModelNotFoundError(Exception):
+    def __init__(self, model_str):
+        super().__init__(f"HuggingFace model not found: '{model_str}'")
+
+
+def hf_model(model_str):
+    api = HfApi()
+    models = [m.modelId for m in api.list_models()]
+    if model_str in models:
+        return model_str
+    else:
+        raise HFModelNotFoundError(model_str)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--model_path",
+    "-o",
+    type=dir_path,
+    default="model",
+    help="Output directory for downloaded model files",
+)
+parser.add_argument(
+    "--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name"
+)
+parser.add_argument("--revision", "-r", type=str, default="main", help="Revision")
+args = parser.parse_args()
+
+snapshot_path = snapshot_download(
+    repo_id=args.model_name,
+    revision=args.revision,
+    cache_dir=args.model_path,
+    use_auth_token=True,
+    ignore_patterns=["original/*", "pytorch_model*.bin"],
+)
+
+print(f"Files for '{args.model_name}' are downloaded to '{snapshot_path}'")
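
A hedged usage sketch for the download script when invoked by hand (inside the container, `dockerd-entrypoint.sh` calls it with the model-store path; the model name and output directory below are only examples):

```
huggingface-cli login   # snapshot_download(..., use_auth_token=True) needs a cached token
python Download_model.py --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path model
```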
examples/LLM/llama2/chat_app/docker/build_image.sh → examples/LLM/llama/chat_app/docker/build_image.sh

@@ -1,17 +1,8 @@
 #!/bin/bash
 
-# Check if there are enough arguments
-if [ "$#" -eq 0 ] || [ "$#" -gt 1 ]; then
-    echo "Usage: $0 <HF Model>"
-    exit 1
-fi
-
-MODEL_NAME=$(echo "$1" | sed 's/\//---/g')
-echo "Model: " $MODEL_NAME
-
 BASE_IMAGE="pytorch/torchserve:latest-cpu"
 
-DOCKER_TAG="pytorch/torchserve:${MODEL_NAME}"
+DOCKER_TAG="pytorch/torchserve:chat_bot"
 
 # Get relative path of example dir
 EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")

@@ -20,9 +11,10 @@ ROOT_DIR=$(realpath "$ROOT_DIR")
 EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")
 
 # Build docker image for the application
-DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg MODEL_NAME="${MODEL_NAME}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" .
+DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" .
 
 echo "Run the following command to start the chat bot"
 echo ""
-echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store $DOCKER_TAG
+echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store -e MODEL_NAME="meta-llama/Llama-2-7b-chat-hf" $DOCKER_TAG
 echo ""
+echo "Note: You can replace the model identifier as needed"

‎examples/LLM/llama2/chat_app/docker/client_app.py ‎examples/LLM/llama/chat_app/docker/client_app.py

+1

@@ -6,6 +6,7 @@
 import streamlit as st
 
 MODEL_NAME = os.environ["MODEL_NAME"]
+MODEL_NAME = MODEL_NAME.replace("/", "---")
 
 # App title
 st.set_page_config(page_title="TorchServe Chatbot")
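
The `/` → `---` substitution mirrors the `sed` call in `dockerd-entrypoint.sh` that derives the model directory name, e.g.:

```
echo "meta-llama/Meta-Llama-3-8B-Instruct" | sed 's/\//---/g'
# meta-llama---Meta-Llama-3-8B-Instruct
```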
examples/LLM/llama/chat_app/docker/dockerd-entrypoint.sh (new file)

+81

@@ -0,0 +1,81 @@
+#!/bin/bash
+set -e
+
+MODEL_DIR=$(echo "$MODEL_NAME" | sed 's/\//---/g')
+
+export LLAMA_Q4_MODEL=/home/model-server/model-store/$MODEL_DIR/model/ggml-model-q4_0.gguf
+
+
+create_model_cfg_yaml() {
+  # Define the YAML content with a placeholder for the model name
+  yaml_content="# TorchServe frontend parameters\nminWorkers: 1\nmaxWorkers: 1\nresponseTimeout: 1200\n#deviceType: \"gpu\"\n#deviceIds: [0,1]\n#torchrun:\n# nproc-per-node: 1\n\nhandler:\n model_name: \"${2}\"\n manual_seed: 40"
+
+  # Create the YAML file
+  echo -e "$yaml_content" > "model-config.yaml"
+}
+
+create_model_archive() {
+  MODEL_DIR=$1
+  echo "Create model archive for ${MODEL_DIR} if it doesn't already exist"
+  if [ -d "/home/model-server/model-store/$MODEL_DIR" ]; then
+    echo "Model archive for $MODEL_DIR exists."
+  fi
+  if [ -d "/home/model-server/model-store/$MODEL_DIR/model" ]; then
+    echo "Model already downloaded"
+    mv /home/model-server/model-store/$MODEL_DIR/model /home/model-server/model-store/
+  else
+    echo "Model needs to be downloaded"
+  fi
+  torch-model-archiver --model-name "$MODEL_DIR" --version 1.0 --handler llama_cpp_handler.py --config-file "model-config.yaml" -r requirements.txt --archive-format no-archive --export-path /home/model-server/model-store -f
+  if [ -d "/home/model-server/model-store/model" ]; then
+    mv /home/model-server/model-store/model /home/model-server/model-store/$MODEL_DIR/
+  fi
+}
+
+download_model() {
+  MODEL_DIR=$1
+  MODEL_NAME=$2
+  if [ -d "/home/model-server/model-store/$MODEL_DIR/model" ]; then
+    echo "Model $MODEL_NAME already downloaded"
+  else
+    echo "Downloading model $MODEL_NAME"
+    python Download_model.py --model_path /home/model-server/model-store/$MODEL_DIR/model --model_name $MODEL_NAME
+  fi
+}
+
+quantize_model() {
+  if [ ! -f "$LLAMA_Q4_MODEL" ]; then
+    tmp_model_name=$(echo "$MODEL_DIR" | sed 's/---/--/g')
+    directory_path=/home/model-server/model-store/$MODEL_DIR/model/models--$tmp_model_name/snapshots/
+    HF_MODEL_SNAPSHOT=$(find $directory_path -type d -mindepth 1)
+    cd build
+
+    echo "Convert the model to ggml FP16 format"
+    if [[ $MODEL_NAME == *"Meta-Llama-3"* ]]; then
+      python convert.py $HF_MODEL_SNAPSHOT --vocab-type bpe,hfft --outfile ggml-model-f16.gguf
+    else
+      python convert.py $HF_MODEL_SNAPSHOT --outfile ggml-model-f16.gguf
+    fi
+
+    echo "Quantize the model to 4-bits (using q4_0 method)"
+    ./quantize ggml-model-f16.gguf $LLAMA_Q4_MODEL q4_0
+
+    cd ..
+    echo "Saved quantized model weights to $LLAMA_Q4_MODEL"
+  fi
+}
+
+if [[ "$1" = "serve" ]]; then
+  shift 1
+  create_model_cfg_yaml $MODEL_DIR $MODEL_NAME
+  create_model_archive $MODEL_DIR
+  download_model $MODEL_DIR $MODEL_NAME
+  quantize_model
+  streamlit run torchserve_server_app.py --server.port 8084 &
+  streamlit run client_app.py --server.port 8085
+else
+  eval "$@"
+fi
+
+# prevent docker exit
+tail -f /dev/null
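
On a first run with `MODEL_NAME=meta-llama/Meta-Llama-3-8B-Instruct`, the entrypoint leaves roughly the following layout in the mounted model store (a sketch derived from the paths above; exact contents may differ):

```
ls /home/model-server/model-store/meta-llama---Meta-Llama-3-8B-Instruct
# MAR-INF/  llama_cpp_handler.py  model-config.yaml  model/  ...
ls /home/model-server/model-store/meta-llama---Meta-Llama-3-8B-Instruct/model
# models--meta-llama--Meta-Llama-3-8B-Instruct/  ggml-model-q4_0.gguf   <- $LLAMA_Q4_MODEL
```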

‎examples/LLM/llama2/chat_app/docker/llama_cpp_handler.py ‎examples/LLM/llama/chat_app/docker/llama_cpp_handler.py

+1 −1

@@ -23,7 +23,7 @@ def initialize(self, ctx):
         ctx (context): It is a JSON Object containing information
         pertaining to the model artifacts parameters.
         """
-        model_path = os.environ["LLAMA2_Q4_MODEL"]
+        model_path = os.environ["LLAMA_Q4_MODEL"]
         model_name = ctx.model_yaml_config["handler"]["model_name"]
         seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
         torch.manual_seed(seed)
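
For the standalone (non-Docker) Mac flow described in the chat app Readme, the renamed variable has to be exported before starting TorchServe; a minimal sketch with example paths:

```
export LLAMA_Q4_MODEL=$PWD/ggml-model-q4_0.gguf
torchserve --start --ncs --model-store model_store --models llamacpp.tar.gz
```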

‎examples/LLM/llama2/chat_app/docker/torchserve_server_app.py ‎examples/LLM/llama/chat_app/docker/torchserve_server_app.py

+1

@@ -7,6 +7,7 @@
 import streamlit as st
 
 MODEL_NAME = os.environ["MODEL_NAME"]
+MODEL_NAME = MODEL_NAME.replace("/", "---")
 MODEL = MODEL_NAME.split("---")[1]
 
 # App title

‎examples/LLM/llama2/chat_app/package_llama.sh ‎examples/LLM/llama/chat_app/package_llama.sh

+14 −11

@@ -2,12 +2,12 @@
 # Check if the argument is empty or unset
 if [ -z "$1" ]; then
     echo "Missing Mandatory argument: Path to llama weights"
-    echo "Usage: ./package_llama.sh ./model/models--meta-llama--Llama-2-7b-chat-hf/snapshots/08751db2aca9bf2f7f80d2e516117a53d7450235"
+    echo "Usage: ./package_llama.sh ./models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298"
     exit 1
 fi
 
 MODEL_GENERATION="true"
-LLAMA2_WEIGHTS="$1"
+LLAMA_WEIGHTS="$1"
 
 if [ -n "$2" ]; then
     MODEL_GENERATION="$2"
@@ -20,18 +20,22 @@ if [ "$MODEL_GENERATION" = "true" ]; then
     rm -rf build
     git clone https://github.com/ggerganov/llama.cpp.git build
     cd build
-    make
+    make
     python -m pip install -r requirements.txt
-
-    echo "Convert the 7B model to ggml FP16 format"
-    python convert.py $LLAMA2_WEIGHTS --outfile ggml-model-f16.gguf
-
+
+    echo "Convert the model to ggml FP16 format"
+    if [[ $LLAMA_WEIGHTS == *"Meta-Llama-3"* ]]; then
+        python convert.py $LLAMA_WEIGHTS --vocab-type bpe,hfft --outfile ggml-model-f16.gguf
+    else
+        python convert.py $LLAMA_WEIGHTS --outfile ggml-model-f16.gguf
+    fi
+
     echo "Quantize the model to 4-bits (using q4_0 method)"
     ./quantize ggml-model-f16.gguf ../ggml-model-q4_0.gguf q4_0
-
+
     cd ..
-    export LLAMA2_Q4_MODEL=$PWD/ggml-model-q4_0.gguf
-    echo "Saved quantized model weights to $LLAMA2_Q4_MODEL"
+    export LLAMA_Q4_MODEL=$PWD/ggml-model-q4_0.gguf
+    echo "Saved quantized model weights to $LLAMA_Q4_MODEL"
 fi
 
 echo "Creating torchserve model archive"
@@ -43,4 +47,3 @@ if [ "$MODEL_GENERATION" = "true" ]; then
     echo "Cleaning up build of llama-cpp"
     rm -rf build
 fi
-
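
Usage of the packaging script is unchanged apart from the naming; for example (the snapshot hash is a placeholder, and the optional second argument maps to `MODEL_GENERATION`):

```
# First run: convert + quantize the snapshot, then build the TorchServe archive
source package_llama.sh ./models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/<snapshot-hash>

# Subsequent runs: skip weight generation and only repackage the handler and model-config.yaml
source package_llama.sh ./models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/<snapshot-hash> false
```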
requirements.txt (new file)

+1

@@ -0,0 +1 @@
+streamlit>=1.26.0
File renamed without changes.

‎examples/LLM/llama2/chat_app/docker/dockerd-entrypoint.sh

-81
This file was deleted.

‎examples/LLM/llama2/chat_app/requirements.txt

-1
This file was deleted.
