
Commit 8f3aadd

Merge branch 'master' of github.com:m10an/TrochServe into load_models_all_targz

2 parents 8b391bc + d993070 · commit 8f3aadd

31 files changed: +488 −207 lines

README.md (+13 −2)

````diff
@@ -62,12 +62,23 @@ Refer to [torchserve docker](docker/README.md) for details.
 
 ### 🤖 Quick Start LLM Deployment
 
+#### VLLM Engine
 ```bash
 # Make sure to install torchserve with pip or conda as described above and login with `huggingface-cli login`
-python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
+python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --disable_token_auth
 
 # Try it out
-curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
+curl -X POST -d '{"model":"meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
+```
+
+#### TRT-LLM Engine
+```bash
+# Make sure to install torchserve with python venv as described above and login with `huggingface-cli login`
+# pip install -U --use-deprecated=legacy-resolver -r requirements/trt_llm.txt
+python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --engine trt_llm --disable_token_auth
+
+# Try it out
+curl -X POST -d '{"prompt":"count from 1 to 9 in french ", "max_tokens": 100}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model"
 ```
 
 ### 🚢 Quick Start LLM Deployment with Docker
````
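For reference, a hedged Python sketch of the "Try it out" request against the vLLM engine's OpenAI-style completions endpoint added above. It assumes a local TorchServe instance started with the launcher command from the diff (model registered as `model`, version `1.0`, inference port 8080) and uses the third-party `requests` package, which is not part of this commit:

```python
# Sketch only: mirrors the curl example from the README diff above.
import requests

payload = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "prompt": "Hello, my name is",
    "max_tokens": 200,
}
resp = requests.post(
    "http://localhost:8080/predictions/model/1.0/v1/completions",
    json=payload,
    timeout=120,
)
resp.raise_for_status()
print(resp.json())
```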

benchmarks/utils/system_under_test.py (+26)

````diff
@@ -113,6 +113,7 @@ def start(self):
         execute("torchserve --stop", wait=True)
         click.secho("*Setting up model store...", fg="green")
         self._prepare_local_dependency()
+        self._clear_neuron_cache_if_exists()
         click.secho("*Starting local Torchserve instance...", fg="green")
 
         ts_cmd = (
@@ -141,6 +142,31 @@ def start(self):
             if "Model server started" in str(line).strip():
                 break
 
+    def _clear_neuron_cache_if_exists(self):
+        cache_dir = "/var/tmp/neuron-compile-cache/"
+
+        # Check if the directory exists
+        if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
+            click.secho(
+                f"Directory {cache_dir} exists. Clearing contents...", fg="green"
+            )
+
+            # Remove the directory contents
+            for filename in os.listdir(cache_dir):
+                file_path = os.path.join(cache_dir, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    click.secho(f"Failed to delete {file_path}. Reason: {e}", fg="red")
+            click.secho(f"Cache cleared: {cache_dir}", fg="green")
+        else:
+            click.secho(
+                f"Directory {cache_dir} does not exist. No action taken.", fg="green"
+            )
+
     def stop(self):
         click.secho("*Terminating Torchserve instance...", fg="green")
         execute("torchserve --stop", wait=True)
````

docker/Dockerfile (+2 −2)

````diff
@@ -73,7 +73,7 @@ COPY ./ serve
 RUN \
     if echo "$LOCAL_CHANGES" | grep -q "false"; then \
         rm -rf serve;\
-        git clone --recursive $REPO_URL -b $BRANCH_NAME; \
+        git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
     fi
 
 
@@ -238,7 +238,7 @@ COPY ./ serve
 RUN \
     if echo "$LOCAL_CHANGES" | grep -q "false"; then \
         rm -rf serve;\
-        git clone --recursive $REPO_URL -b $BRANCH_NAME; \
+        git clone --recursive $REPO_URL -b $BRANCH_NAME serve; \
     fi
 
 COPY --from=compile-image /home/venv /home/venv
````

examples/large_models/trt_llm/llama/README.md (+18 −15)

````diff
@@ -4,19 +4,19 @@
 
 ## Pre-requisites
 
-TRT-LLM requires Python 3.10
+- TRT-LLM requires Python 3.10
+- TRT-LLM works well with python venv (vs conda)
 This example is tested with CUDA 12.1
 Once TorchServe is installed, install TensorRT-LLM using the following.
-This will downgrade the versions of PyTorch & Triton but this doesn't cause any issue.
 
 ```
-pip install tensorrt_llm==0.10.0 --extra-index-url https://pypi.nvidia.com
-pip install tensorrt-cu12==10.1.0
+pip install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com
+pip install transformers>=4.44.2
 python -c "import tensorrt_llm"
 ```
 shows
 ```
-[TensorRT-LLM] TensorRT-LLM version: 0.10.0
+[TensorRT-LLM] TensorRT-LLM version: 0.13.0.dev2024090300
 ```
 
 ## Download model from HuggingFace
@@ -26,29 +26,32 @@ huggingface-cli login
 huggingface-cli login --token $HUGGINGFACE_TOKEN
 ```
 ```
-python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3-8B-Instruct
+python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3.1-8B-Instruct --use_auth_token True
 ```
 
 ## Create TensorRT-LLM Engine
 Clone TensorRT-LLM which will be used to create the TensorRT-LLM Engine
 
 ```
-git clone -b v0.10.0 https://github.com/NVIDIA/TensorRT-LLM.git
+git clone https://github.com/NVIDIA/TensorRT-LLM.git
 ```
 
 Compile the model into a TensorRT engine with model weights and a model definition written in the TensorRT-LLM Python API.
 
 ```
-python TensorRT-LLM/examples/llama/convert_checkpoint.py --model_dir model/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/ --output_dir ./tllm_checkpoint_1gpu_bf16 --dtype bfloat16
+python TensorRT-LLM/examples/llama/convert_checkpoint.py --model_dir model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/ --output_dir ./tllm_checkpoint_1gpu_bf16 --dtype bfloat16
 ```
+
 ```
-trtllm-build --checkpoint_dir tllm_checkpoint_1gpu_bf16 --gemm_plugin bfloat16 --gpt_attention_plugin bfloat16 --output_dir ./llama-3-8b-engine
+trtllm-build --checkpoint_dir tllm_checkpoint_1gpu_bf16 --gemm_plugin bfloat16 --gpt_attention_plugin bfloat16 --max_batch_size 4 --output_dir ./llama-3.1-8b-engine
 ```
+If you have enough GPU memory, you can try increasing the `max_batch_size`
 
 You can test if TensorRT-LLM Engine has been compiled correctly by running the following
 ```
-python TensorRT-LLM/examples/run.py --engine_dir ./llama-3-8b-engine --max_output_len 100 --tokenizer_dir model/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/ --input_text "How do I count to nine in French?"
+python TensorRT-LLM/examples/run.py --engine_dir ./llama-3.1-8b-engine --max_output_len 100 --tokenizer_dir model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/ --input_text "How do I count to nine in French?"
 ```
+If you are running into OOM, try reducing `kv_cache_free_gpu_memory_fraction`
 
 You should see an output as follows
 ```
@@ -70,17 +73,17 @@ That's it! You can now count to nine in French. Just remember that the numbers o
 
 ```
 mkdir model_store
-torch-model-archiver --model-name llama3-8b --version 1.0 --handler trt_llm_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store -f
-mv model model_store/llama3-8b/.
-mv llama-3-8b-engine model_store/llama3-8b/.
+torch-model-archiver --model-name llama3.1-8b --version 1.0 --handler trt_llm_handler --config-file model-config.yaml --archive-format no-archive --export-path model_store -f
+mv model model_store/llama3.1-8b/.
+mv llama-3.1-8b-engine model_store/llama3.1-8b/.
 ```
 
 ## Start TorchServe
 ```
-torchserve --start --ncs --model-store model_store --models llama3-8b --disable-token-auth
+torchserve --start --ncs --model-store model_store --models llama3.1-8b --disable-token-auth
 ```
 
 ## Run Inference
 ```
-python ../../utils/test_llm_streaming_response.py -o 50 -t 2 -n 4 -m llama3-8b --prompt-text "@prompt.json" --prompt-json
+python ../../utils/test_llm_streaming_response.py -o 50 -t 2 -n 4 -m llama3.1-8b --prompt-text "@prompt.json" --prompt-json
 ```
````

examples/large_models/trt_llm/llama/model-config.yaml (+4 −3)

````diff
@@ -7,6 +7,7 @@ deviceType: "gpu"
 asyncCommunication: true
 
 handler:
-    tokenizer_dir: "model/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/"
-    trt_llm_engine_config:
-        engine_dir: "llama-3-8b-engine"
+    tokenizer_dir: "model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/"
+    engine_dir: "llama-3.1-8b-engine"
+    kv_cache_config:
+        free_gpu_memory_fraction: 0.1
````
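The change flattens the handler section: `engine_dir` moves out of the removed `trt_llm_engine_config` block and sits directly under `handler`, next to a new `kv_cache_config` group. A small sketch of how the resulting fragment parses (PyYAML is used here only for illustration and is not part of this commit):

```python
# Sketch only: inspect the handler section as it looks after this change.
import yaml  # PyYAML

fragment = """
handler:
    tokenizer_dir: "model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/"
    engine_dir: "llama-3.1-8b-engine"
    kv_cache_config:
        free_gpu_memory_fraction: 0.1
"""
handler_cfg = yaml.safe_load(fragment)["handler"]
assert handler_cfg["engine_dir"] == "llama-3.1-8b-engine"
assert handler_cfg["kv_cache_config"]["free_gpu_memory_fraction"] == 0.1
```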
Example prompt JSON (+2 −1; the file name was not captured in this view, but the hunk matches the payload referenced as `@prompt.json` in the READMEs above)

````diff
@@ -1,3 +1,4 @@
 {"prompt": "How is the climate in San Francisco?",
 "temperature":0.5,
-"max_new_tokens": 200}
+"max_tokens": 400,
+"streaming": true}
````

examples/large_models/trt_llm/llama/trt_llm_handler.py (−118)

This file was deleted.
New file (+83; the file path was not captured in this view): README for the Llama TensorRT-LLM Engine + LoRA example

````diff
@@ -0,0 +1,83 @@
+# Llama TensorRT-LLM Engine + LoRA model integration with TorchServe
+
+[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an option to build TensorRT engines for LLMs that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.
+
+## Pre-requisites
+
+- TRT-LLM requires Python 3.10
+- TRT-LLM works well with python venv (vs conda)
+This example is tested with CUDA 12.1
+Once TorchServe is installed, install TensorRT-LLM using the following.
+
+```
+pip install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com
+pip install transformers>=4.44.2
+python -c "import tensorrt_llm"
+```
+shows
+```
+[TensorRT-LLM] TensorRT-LLM version: 0.13.0.dev2024090300
+```
+
+## Download Base model & LoRA adapter from Hugging Face
+```
+huggingface-cli login
+# or using an environment variable
+huggingface-cli login --token $HUGGINGFACE_TOKEN
+```
+```
+python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3.1-8B-Instruct --use_auth_token True
+python ../../utils/Download_model.py --model_path model --model_name llama-duo/llama3.1-8b-summarize-gpt4o-128k --use_auth_token True
+```
+
+## Create TensorRT-LLM Engine
+Clone TensorRT-LLM which will be used to create the TensorRT-LLM Engine
+
+```
+git clone https://github.com/NVIDIA/TensorRT-LLM.git
+```
+
+Compile the model into a TensorRT engine with model weights and a model definition written in the TensorRT-LLM Python API.
+
+```
+python TensorRT-LLM/examples/llama/convert_checkpoint.py --model_dir model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f --output_dir ./tllm_checkpoint_1gpu_bf16 --dtype bfloat16
+```
+
+```
+trtllm-build --checkpoint_dir tllm_checkpoint_1gpu_bf16 --gemm_plugin bfloat16 --gpt_attention_plugin bfloat16 --output_dir ./llama-3.1-8b-engine-lora --max_batch_size 4 --lora_dir model/models--llama-duo--llama3.1-8b-summarize-gpt4o-128k/snapshots/4ba83353f24fa38946625c8cc49bf21c80a22825 --lora_plugin bfloat16
+```
+If you have enough GPU memory, you can try increasing the `max_batch_size`
+
+You can test if TensorRT-LLM Engine has been compiled correctly by running the following
+```
+python TensorRT-LLM/examples/run.py --engine_dir ./llama-3.1-8b-engine-lora --max_output_len 100 --tokenizer_dir model/models--llama-duo--llama3.1-8b-summarize-gpt4o-128k/snapshots/4ba83353f24fa38946625c8cc49bf21c80a22825 --input_text "Amanda: I baked cookies. Do you want some?\nJerry: Sure \nAmanda: I will bring you tomorrow :-)\n\nSummarize the dialog:" --lora_dir model/models--llama-duo--llama3.1-8b-summarize-gpt4o-128k/snapshots/4ba83353f24fa38946625c8cc49bf21c80a22825 --kv_cache_free_gpu_memory_fraction 0.3 --use_py_session
+```
+If you are running into OOM, try reducing `kv_cache_free_gpu_memory_fraction`
+
+You should see an output as follows
+```
+Input [Text 0]: "<|begin_of_text|>Amanda: I baked cookies. Do you want some?\nJerry: Sure \nAmanda: I will bring you tomorrow :-)\n\nSummarize the dialog:"
+Output [Text 0 Beam 0]: " Amanda offered Jerry cookies and said she would bring them to him tomorrow.
+Amanda offered Jerry cookies and said she would bring them to him tomorrow.
+The dialogue is between Amanda and Jerry. Amanda offers Jerry cookies and says she will bring them to him tomorrow. The dialogue is a simple exchange between two people, with no complex plot or themes. The tone is casual and friendly. The dialogue is a good example of a short, everyday conversation.
+The dialogue is a good example of a short,"
+```
+
+## Create model archive
+
+```
+mkdir model_store
+torch-model-archiver --model-name llama3.1-8b --version 1.0 --handler trt_llm_handler --config-file model-config.yaml --archive-format no-archive --export-path model_store -f
+mv model model_store/llama3.1-8b/.
+mv llama-3.1-8b-engine-lora model_store/llama3.1-8b/.
+```
+
+## Start TorchServe
+```
+torchserve --start --ncs --model-store model_store --models llama3.1-8b --disable-token-auth
+```
+
+## Run Inference
+```
+python ../../utils/test_llm_streaming_response.py -o 50 -t 2 -n 4 -m llama3.1-8b --prompt-text "@prompt.json" --prompt-json
+```
````
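Before running the streaming test client, it can help to confirm that the archive registered correctly. A hedged sketch using TorchServe's management API (the default management port 8081 is an assumption of a stock configuration; this check is not part of the commit):

```python
# Sketch only: verify the llama3.1-8b model is registered and has live workers.
import requests

resp = requests.get("http://localhost:8081/models/llama3.1-8b", timeout=30)
resp.raise_for_status()
print(resp.json())  # model version and per-worker status
```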
