
Commit 192785a
Merge branch 'docker_aarch' of https://github.com/pytorch/serve into docker_aarch
2 parents a3079a4 + a45eb7c

20 files changed: +450 -185 lines changed

README.md

+13-2
@@ -62,12 +62,23 @@ Refer to [torchserve docker](docker/README.md) for details.
 
 ### 🤖 Quick Start LLM Deployment
 
+#### VLLM Engine
 ```bash
 # Make sure to install torchserve with pip or conda as described above and login with `huggingface-cli login`
-python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
+python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --disable_token_auth
 
 # Try it out
-curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
+curl -X POST -d '{"model":"meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
+```
+
+#### TRT-LLM Engine
+```bash
+# Make sure to install torchserve with python venv as described above and login with `huggingface-cli login`
+# pip install -U --use-deprecated=legacy-resolver -r requirements/trt_llm.txt
+python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --engine trt_llm --disable_token_auth
+
+# Try it out
+curl -X POST -d '{"prompt":"count from 1 to 9 in french ", "max_tokens": 100}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model"
 ```
 
 ### 🚢 Quick Start LLM Deployment with Docker
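The curl call in the VLLM quick start above is a plain HTTP POST against the OpenAI-style completions route, so it can just as easily be sent from Python. A minimal sketch (not part of the diff), assuming TorchServe is listening on the default inference port 8080 and the launcher registered the model under the name `model` as in the example:

```python
# Minimal sketch: the quick-start completions request sent with Python's
# requests library instead of curl. Assumes TorchServe runs on localhost:8080
# and the vLLM launcher registered the model under the name "model".
import requests

payload = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "prompt": "Hello, my name is",
    "max_tokens": 200,
}
resp = requests.post(
    "http://localhost:8080/predictions/model/1.0/v1/completions",
    json=payload,
    timeout=120,
)
print(resp.text)
```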

examples/large_models/trt_llm/llama/README.md

+18-15
@@ -4,19 +4,19 @@
 
 ## Pre-requisites
 
-TRT-LLM requires Python 3.10
+- TRT-LLM requires Python 3.10
+- TRT-LLM works well with python venv (vs conda)
 This example is tested with CUDA 12.1
 Once TorchServe is installed, install TensorRT-LLM using the following.
-This will downgrade the versions of PyTorch & Triton but this doesn't cause any issue.
 
 ```
-pip install tensorrt_llm==0.10.0 --extra-index-url https://pypi.nvidia.com
-pip install tensorrt-cu12==10.1.0
+pip install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com
+pip install transformers>=4.44.2
 python -c "import tensorrt_llm"
 ```
 shows
 ```
-[TensorRT-LLM] TensorRT-LLM version: 0.10.0
+[TensorRT-LLM] TensorRT-LLM version: 0.13.0.dev2024090300
 ```
 
 ## Download model from HuggingFace
@@ -26,29 +26,32 @@ huggingface-cli login
 huggingface-cli login --token $HUGGINGFACE_TOKEN
 ```
 ```
-python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3-8B-Instruct
+python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3.1-8B-Instruct --use_auth_token True
 ```
 
 ## Create TensorRT-LLM Engine
 Clone TensorRT-LLM which will be used to create the TensorRT-LLM Engine
 
 ```
-git clone -b v0.10.0 https://github.com/NVIDIA/TensorRT-LLM.git
+git clone https://github.com/NVIDIA/TensorRT-LLM.git
 ```
 
 Compile the model into a TensorRT engine with model weights and a model definition written in the TensorRT-LLM Python API.
 
 ```
-python TensorRT-LLM/examples/llama/convert_checkpoint.py --model_dir model/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/ --output_dir ./tllm_checkpoint_1gpu_bf16 --dtype bfloat16
+python TensorRT-LLM/examples/llama/convert_checkpoint.py --model_dir model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/ --output_dir ./tllm_checkpoint_1gpu_bf16 --dtype bfloat16
 ```
+
 ```
-trtllm-build --checkpoint_dir tllm_checkpoint_1gpu_bf16 --gemm_plugin bfloat16 --gpt_attention_plugin bfloat16 --output_dir ./llama-3-8b-engine
+trtllm-build --checkpoint_dir tllm_checkpoint_1gpu_bf16 --gemm_plugin bfloat16 --gpt_attention_plugin bfloat16 --max_batch_size 4 --output_dir ./llama-3.1-8b-engine
 ```
+If you have enough GPU memory, you can try increasing the `max_batch_size`
 
 You can test if TensorRT-LLM Engine has been compiled correctly by running the following
 ```
-python TensorRT-LLM/examples/run.py --engine_dir ./llama-3-8b-engine --max_output_len 100 --tokenizer_dir model/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/ --input_text "How do I count to nine in French?"
+python TensorRT-LLM/examples/run.py --engine_dir ./llama-3.1-8b-engine --max_output_len 100 --tokenizer_dir model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/ --input_text "How do I count to nine in French?"
 ```
+If you are running into OOM, try reducing `kv_cache_free_gpu_memory_fraction`
 
 You should see an output as follows
 ```
@@ -70,17 +73,17 @@ That's it! You can now count to nine in French. Just remember that the numbers o
 
 ```
 mkdir model_store
-torch-model-archiver --model-name llama3-8b --version 1.0 --handler trt_llm_handler.py --config-file model-config.yaml --archive-format no-archive --export-path model_store -f
-mv model model_store/llama3-8b/.
-mv llama-3-8b-engine model_store/llama3-8b/.
+torch-model-archiver --model-name llama3.1-8b --version 1.0 --handler trt_llm_handler --config-file model-config.yaml --archive-format no-archive --export-path model_store -f
+mv model model_store/llama3.1-8b/.
+mv llama-3.1-8b-engine model_store/llama3.1-8b/.
 ```
 
 ## Start TorchServe
 ```
-torchserve --start --ncs --model-store model_store --models llama3-8b --disable-token-auth
+torchserve --start --ncs --model-store model_store --models llama3.1-8b --disable-token-auth
 ```
 
 ## Run Inference
 ```
-python ../../utils/test_llm_streaming_response.py -o 50 -t 2 -n 4 -m llama3-8b --prompt-text "@prompt.json" --prompt-json
+python ../../utils/test_llm_streaming_response.py -o 50 -t 2 -n 4 -m llama3.1-8b --prompt-text "@prompt.json" --prompt-json
 ```
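The Run Inference step drives the `llama3.1-8b` endpoint through `test_llm_streaming_response.py`. As a rough sketch only (assuming TorchServe is up on the default port 8080 and this example's `prompt.json` sits in the working directory), an equivalent streaming request can be sent directly with Python's `requests` library:

```python
# Rough sketch: stream the generated text from the llama3.1-8b endpoint.
# Assumes TorchServe runs on localhost:8080 and prompt.json sets
# "streaming": true so the response arrives as chunked text.
import json
import requests

with open("prompt.json") as f:
    payload = json.load(f)

with requests.post(
    "http://localhost:8080/predictions/llama3.1-8b",
    json=payload,
    stream=True,
) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None):
        if chunk:
            print(chunk.decode("utf-8"), end="", flush=True)
```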

examples/large_models/trt_llm/llama/model-config.yaml

+4-3
@@ -7,6 +7,7 @@ deviceType: "gpu"
 asyncCommunication: true
 
 handler:
-    tokenizer_dir: "model/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/"
-    trt_llm_engine_config:
-        engine_dir: "llama-3-8b-engine"
+    tokenizer_dir: "model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f/"
+    engine_dir: "llama-3.1-8b-engine"
+    kv_cache_config:
+        free_gpu_memory_fraction: 0.1
examples/large_models/trt_llm/llama/prompt.json

+2-1
@@ -1,3 +1,4 @@
 {"prompt": "How is the climate in San Francisco?",
 "temperature":0.5,
-"max_new_tokens": 200}
+"max_tokens": 400,
+"streaming": true}

examples/large_models/trt_llm/llama/trt_llm_handler.py

-118
This file was deleted.
examples/large_models/trt_llm/lora/README.md

+83
@@ -0,0 +1,83 @@
+# Llama TensorRT-LLM Engine + LoRA model integration with TorchServe
+
+[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) provides users with an option to build TensorRT engines for LLMs that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.
+
+## Pre-requisites
+
+- TRT-LLM requires Python 3.10
+- TRT-LLM works well with python venv (vs conda)
+This example is tested with CUDA 12.1
+Once TorchServe is installed, install TensorRT-LLM using the following.
+
+```
+pip install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com
+pip install transformers>=4.44.2
+python -c "import tensorrt_llm"
+```
+shows
+```
+[TensorRT-LLM] TensorRT-LLM version: 0.13.0.dev2024090300
+```
+
+## Download Base model & LoRA adapter from Hugging Face
+```
+huggingface-cli login
+# or using an environment variable
+huggingface-cli login --token $HUGGINGFACE_TOKEN
+```
+```
+python ../../utils/Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3.1-8B-Instruct --use_auth_token True
+python ../../utils/Download_model.py --model_path model --model_name llama-duo/llama3.1-8b-summarize-gpt4o-128k --use_auth_token True
+```
+
+## Create TensorRT-LLM Engine
+Clone TensorRT-LLM which will be used to create the TensorRT-LLM Engine
+
+```
+git clone https://github.com/NVIDIA/TensorRT-LLM.git
+```
+
+Compile the model into a TensorRT engine with model weights and a model definition written in the TensorRT-LLM Python API.
+
+```
+python TensorRT-LLM/examples/llama/convert_checkpoint.py --model_dir model/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f --output_dir ./tllm_checkpoint_1gpu_bf16 --dtype bfloat16
+```
+
+```
+trtllm-build --checkpoint_dir tllm_checkpoint_1gpu_bf16 --gemm_plugin bfloat16 --gpt_attention_plugin bfloat16 --output_dir ./llama-3.1-8b-engine-lora --max_batch_size 4 --lora_dir model/models--llama-duo--llama3.1-8b-summarize-gpt4o-128k/snapshots/4ba83353f24fa38946625c8cc49bf21c80a22825 --lora_plugin bfloat16
+```
+If you have enough GPU memory, you can try increasing the `max_batch_size`
+
+You can test if TensorRT-LLM Engine has been compiled correctly by running the following
+```
+python TensorRT-LLM/examples/run.py --engine_dir ./llama-3.1-8b-engine-lora --max_output_len 100 --tokenizer_dir model/models--llama-duo--llama3.1-8b-summarize-gpt4o-128k/snapshots/4ba83353f24fa38946625c8cc49bf21c80a22825 --input_text "Amanda: I baked cookies. Do you want some?\nJerry: Sure \nAmanda: I will bring you tomorrow :-)\n\nSummarize the dialog:" --lora_dir model/models--llama-duo--llama3.1-8b-summarize-gpt4o-128k/snapshots/4ba83353f24fa38946625c8cc49bf21c80a22825 --kv_cache_free_gpu_memory_fraction 0.3 --use_py_session
+```
+If you are running into OOM, try reducing `kv_cache_free_gpu_memory_fraction`
+
+You should see an output as follows
+```
+Input [Text 0]: "<|begin_of_text|>Amanda: I baked cookies. Do you want some?\nJerry: Sure \nAmanda: I will bring you tomorrow :-)\n\nSummarize the dialog:"
+Output [Text 0 Beam 0]: " Amanda offered Jerry cookies and said she would bring them to him tomorrow.
+Amanda offered Jerry cookies and said she would bring them to him tomorrow.
+The dialogue is between Amanda and Jerry. Amanda offers Jerry cookies and says she will bring them to him tomorrow. The dialogue is a simple exchange between two people, with no complex plot or themes. The tone is casual and friendly. The dialogue is a good example of a short, everyday conversation.
+The dialogue is a good example of a short,"
+```
+
+## Create model archive
+
+```
+mkdir model_store
+torch-model-archiver --model-name llama3.1-8b --version 1.0 --handler trt_llm_handler --config-file model-config.yaml --archive-format no-archive --export-path model_store -f
+mv model model_store/llama3.1-8b/.
+mv llama-3.1-8b-engine-lora model_store/llama3.1-8b/.
+```
+
+## Start TorchServe
+```
+torchserve --start --ncs --model-store model_store --models llama3.1-8b --disable-token-auth
+```
+
+## Run Inference
+```
+python ../../utils/test_llm_streaming_response.py -o 50 -t 2 -n 4 -m llama3.1-8b --prompt-text "@prompt.json" --prompt-json
+```
examples/large_models/trt_llm/lora/model-config.yaml

+13
@@ -0,0 +1,13 @@
+# TorchServe frontend parameters
+minWorkers: 1
+maxWorkers: 1
+maxBatchDelay: 100
+responseTimeout: 1200
+deviceType: "gpu"
+asyncCommunication: true
+
+handler:
+    tokenizer_dir: "model/models--llama-duo--llama3.1-8b-summarize-gpt4o-128k/snapshots/4ba83353f24fa38946625c8cc49bf21c80a22825"
+    engine_dir: "llama-3.1-8b-engine-lora"
+    kv_cache_config:
+        free_gpu_memory_fraction: 0.1
examples/large_models/trt_llm/lora/prompt.json

+4
@@ -0,0 +1,4 @@
+{"prompt": "Amanda: I baked cookies. Do you want some?\nJerry: Sure \nAmanda: I will bring you tomorrow :-)\n\nSummarize the dialog:",
+"temperature":0.0,
+"max_new_tokens": 100,
+"streaming": true}

examples/large_models/vllm/llama3/model-config.yaml

+1-1
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true
 
examples/large_models/vllm/lora/Readme.md

+1-1
@@ -55,7 +55,7 @@ The vllm integration uses an OpenAI compatible interface which lets you perform
 
 Curl:
 ```bash
-curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1
+curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions
 ```
 
 Python + Request:

examples/large_models/vllm/lora/model-config.yaml

+1-1
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true
 

examples/large_models/vllm/mistral/model-config.yaml

+1-1
@@ -2,7 +2,7 @@
 minWorkers: 1
 maxWorkers: 1
 maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
 deviceType: "gpu"
 asyncCommunication: true
 

+1-1
@@ -1 +1 @@
-vllm==0.5.0
+vllm==0.6.1.post2
