TorchServe is a flexible and easy-to-use tool for serving and scaling PyTorch models in production.

```bash
docker pull pytorch/torchserve-nightly
```
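
As a quick sketch of running the pulled image (the port mappings below are the default inference, management, and metrics ports; adjust them, and mount a model store, to match your setup):

```bash
# Run the nightly image and publish the default API ports
# (8080 inference, 8081 management, 8082 metrics)
docker run --rm -it -p 8080:8080 -p 8081:8081 -p 8082:8082 pytorch/torchserve-nightly
```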

Refer to [torchserve docker](docker/README.md) for details.
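
For a minimal local sketch without Docker (assuming `torchserve` and `torch-model-archiver` are installed; `my_model` and `my_model.pt` are illustrative placeholders, and `image_classifier` is one of the built-in handlers), you can package a trained model and serve it like this:

```bash
# Create a local model store and package the trained model into a .mar archive
mkdir -p model_store
torch-model-archiver --model-name my_model --version 1.0 \
  --serialized-file my_model.pt \
  --handler image_classifier \
  --export-path model_store

# Start TorchServe and load the archive from the model store
torchserve --start --ncs --model-store model_store --models my_model=my_model.mar

# Stop the server when done
torchserve --stop
```

By default the Inference API listens on port 8080, the Management API on 8081, and the Metrics API on 8082.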

## ⚡ Why TorchServe

* Write once, run anywhere: on-prem or in the cloud, with inference on CPUs, GPUs, AWS Inf1/Inf2/Trn1, Google Cloud TPUs, and [Nvidia MPS](docs/nvidia_mps.md)
* [Model Management API](docs/management_api.md): multi-model management with optimized worker-to-model allocation
* [Inference API](docs/inference_api.md): REST and gRPC support for batched inference (see the REST sketch after this list)
* [TorchServe Workflows](examples/Workflows/README.md): deploy complex DAGs with multiple interdependent models
* Default way to serve PyTorch models in
  * [Sagemaker](https://aws.amazon.com/blogs/machine-learning/serving-pytorch-models-in-production-with-the-amazon-sagemaker-native-torchserve-integration/)
  * [Vertex AI](https://cloud.google.com/blog/topics/developers-practitioners/pytorch-google-cloud-how-deploy-pytorch-models-vertex-ai)
  * [Kubernetes](kubernetes) with support for [autoscaling](kubernetes#session-affinity-with-multiple-torchserve-pods), session affinity, and monitoring using Grafana; works on-prem and on AWS EKS, Google GKE, and Azure AKS
  * [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): supports both v1 and v2 API, with [autoscaling and canary deployments](kubernetes/kserve/README.md#autoscaling) for A/B testing
  * [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
  * [MLflow](https://github.com/mlflow/mlflow-torchserve)
* Export your model for optimized inference: TorchScript out of the box, [PyTorch Compiler](examples/pt2/README.md) preview, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert), and FlashAttention (Better Transformers)
* [Performance Guide](docs/performance_guide.md): built-in support to optimize, benchmark, and profile PyTorch and TorchServe performance
* [Expressive handlers](CONTRIBUTING.md): an expressive handler architecture that makes it trivial to support inference for your use case, with [many handlers supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
* [Metrics API](docs/metrics.md): out-of-the-box support for system-level metrics, [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), and custom metrics (see the metrics sketch after this list)
* [Large Model Inference Guide](docs/large_model_inference.md): support for GenAI and LLMs, including
  * [SOTA GenAI performance](https://github.com/pytorch/serve/tree/master/examples/pt2#torchcompile-genai-examples) using `torch.compile`
  * Fast kernels with FlashAttention v2, continuous batching, and streaming response
  * PyTorch [Tensor Parallel](examples/large_models/tp_llama) preview, [Pipeline Parallel](examples/large_models/Huggingface_pippy)
  * Microsoft [DeepSpeed](examples/large_models/deepspeed), [DeepSpeed-Mii](examples/large_models/deepspeed_mii)
  * Hugging Face [Accelerate](examples/large_models/Huggingface_accelerate), [Diffusers](examples/diffusers)
  * Running large models on AWS [Sagemaker](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-tutorials-torchserve.html) and [Inferentia2](https://pytorch.org/blog/high-performance-llama/)
  * Running [Llama 2 Chatbot locally on Mac](examples/LLM/llama2)
* Monitoring using Grafana and [Datadog](https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve)
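
As referenced in the Inference API item above, a minimal REST sketch against a locally running TorchServe (default ports; `my_model.mar` and `kitten.jpg` are placeholders, and the archive is assumed to already sit in the configured model store):

```bash
# Register a model archive and spin up one worker via the Management API (port 8081)
curl -X POST "http://localhost:8081/models?url=my_model.mar&initial_workers=1"

# List the models currently registered
curl http://localhost:8081/models

# Run a prediction via the Inference API (port 8080)
curl http://localhost:8080/predictions/my_model -T kitten.jpg
```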
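
For the Metrics API item, a quick sketch of scraping the Prometheus-format endpoint (port 8082 by default):

```bash
# Fetch system- and model-level metrics in Prometheus format
curl http://127.0.0.1:8082/metrics
```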

## 📰 News

* [High performance Llama 2 deployments with AWS Inferentia2 using TorchServe](https://pytorch.org/blog/high-performance-llama/)
* [Naver Case Study: Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance](https://pytorch.org/blog/ml-model-server-resource-saving/)
* [Run multiple generative AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe and save up to 75% in inference costs](https://pytorch.org/blog/amazon-sagemaker-w-torchserve/)
* [Deploying your Generative AI model in only four steps with Vertex AI and PyTorch](https://cloud.google.com/blog/products/ai-machine-learning/get-your-genai-model-going-in-four-easy-steps)
* [PyTorch Model Serving on Google Cloud TPU v5](https://cloud.google.com/tpu/docs/v5e-inference#pytorch-model-inference-and-serving)
* [Monitoring using Datadog](https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve)