
Commit 15a9fd9

Authored Mar 7, 2024
Merge branch 'master' into integrate_sanity_tests_with_pytest
2 parents 064eab6 + 14e8d6f commit 15a9fd9

File tree

16 files changed, +237 -193 lines changed


‎.github/workflows/ci-cpu-cpp.yml

+33-8
@@ -16,17 +16,42 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, macOS-latest]
+        os: [ubuntu-20.04]
     steps:
+      # - name: Setup Python for M1
+      #   if: matrix.os == 'macos-14'
+      #   uses: actions/setup-python@v5
+      #   with:
+      #     python-version: '3.10'
+      - name: Setup Python for all other OS
+        if: matrix.os != 'macos-14'
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+          architecture: x64
+      - name: Setup Java 17
+        uses: actions/setup-java@v3
+        with:
+          distribution: 'zulu'
+          java-version: '17'
       - name: Checkout TorchServe
-        uses: actions/checkout@v2
-      - name: Install libtorch - macOS
-        if: matrix.os == 'macOS-latest'
-        run: |
-          brew install libtorch
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      # - name: Install libtorch - macOS
+      #   if: matrix.os == 'macOS-latest'
+      #   run: |
+      #     brew install libtorch
       - name: Install dependencies
         run: |
-          python ts_scripts/install_dependencies.py --environment=dev --cpp
+          sudo apt update && python ts_scripts/install_dependencies.py --environment=dev --cpp
+      - name: Install TorchServe
+        run: |
+          python ts_scripts/install_from_src.py
+      - name: Print Env
+        run: |
+          python ts_scripts/print_env_info.py
       - name: Build
         run: |
-          cd cpp && ./build.sh
+          cd cpp && rm -rf _build && sudo mkdir /mnt/_build && sudo chmod 777 /mnt/_build && mkdir _build && sudo mount --bind /mnt/_build _build
+          ./build.sh

‎.gitmodules

-3
@@ -1,9 +1,6 @@
 [submodule "third_party/google/rpc"]
   path = third_party/google/rpc
   url = https://github.com/googleapis/googleapis.git
-[submodule "cpp/third-party/llama.cpp"]
-  path = cpp/third-party/llama.cpp
-  url = https://github.com/ggerganov/llama.cpp.git
 [submodule "cpp/third-party/llama2.c"]
   path = cpp/third-party/llama2.c
   url = https://github.com/karpathy/llama2.c

‎cpp/CMakeLists.txt

+15-4
@@ -18,10 +18,6 @@ if(CLANG_FORMAT_EXE)
     ${PROJECT_SOURCE_DIR}/test/*.hh
   )

-  add_custom_target(format
-    COMMAND
-    ${CLANG_FORMAT_EXE} -i -style=google ${ALL_CXX_SOURCE_FILES}
-  )
 endif()


@@ -31,6 +27,21 @@ find_package(fmt REQUIRED)
 find_package(gflags REQUIRED)
 find_package(Torch REQUIRED)

+include(FetchContent)
+
+FetchContent_Declare(
+  yaml-cpp
+  GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git
+  GIT_TAG 0.8.0 # Can be a tag (yaml-cpp-x.x.x), a commit hash, or a branch name (master)
+)
+FetchContent_GetProperties(yaml-cpp)
+
+if(NOT yaml-cpp_POPULATED)
+  message(STATUS "Fetching yaml-cpp...")
+  FetchContent_Populate(yaml-cpp)
+  add_subdirectory(${yaml-cpp_SOURCE_DIR} ${yaml-cpp_BINARY_DIR})
+endif()
+
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

 include_directories(${TORCH_INCLUDE_DIRS})

‎cpp/README.md

+12-14
@@ -5,36 +5,34 @@
 * cmake version: 3.18+
 ## Installation and Running TorchServe CPP

+This installation instruction assumes that TorchServe is already installed through pip/conda/source. If this is not the case install it after the `Install dependencies` step through your preferred method.
+
 ### Install dependencies
 ```
 cd serve
 python ts_scripts/install_dependencies.py --cpp --environment dev [--cuda=cu121|cu118]
 ```
 ### Building the backend
+Don't forget to install or update TorchServe at this point if it wasn't previously installed.
 ```
 ## Dev Build
 cd cpp
 ./build.sh [-g cu121|cu118]

-## Install TorchServe from source
-cd ..
-python ts_scripts/install_from_src.py
-```
-### Set Environment Var
-#### On Mac
-```
-export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$(pwd)/_build/_deps/libtorch/lib
-```
-#### On Ubuntu
-```
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)/_build/_deps/libtorch/lib
 ```

 ### Run TorchServe
 ```
 mkdir model_store
 torchserve --ncs --start --model-store model_store
 ```
+
+### Clean the build directory
+To clean the build directory in order to rebuild from scratch simply delete the cpp/_build directory with
+```
+rm -rf cpp/_build
+```
+
 ## Backend
 TorchServe cpp backend can run as a process, which is similar to [TorchServe Python backend](https://github.com/pytorch/serve/tree/master/ts). By default, TorchServe supports torch scripted model in cpp backend. Other platforms such as MxNet, ONNX can be supported through custom handlers following the TorchScript example [src/backends/handler/torch_scripted_handler.hh](https://github.com/pytorch/serve/blob/master/cpp/src/backends/handler/torch_scripted_handler.hh).
 ### Custom Handler
@@ -89,11 +87,11 @@ python -c "import ts; from pathlib import Path; print((Path(ts.__file__).parent
 3. Make sure you have the right conda/venv environment activated during building that you're also using to run TorchServe.

 Q: Build on Mac fails with `Library not loaded: @rpath/libomp.dylib`
-A: Install libomp with brew and link in /usr/local/lib
+A: Install libomp with brew and link in /usr/local/lib
 ```bash
 brew install libomp
 sudo ln -s /opt/homebrew/opt/libomp/lib/libomp.dylib /usr/local/lib/libomp.dylib
 ```

 Q: When loading a handler which uses a model exported with torch._export.aot_compile the handler dies with "error: Error in dlopen: MODEL.SO : undefined symbol: SOME_SYMBOL".
-A: Make sure that you are using matching libtorch and Pytorch versions for inference and export, respectively.
+A: Make sure that you are using matching libtorch and Pytorch versions for inference and export, respectively.

‎cpp/build.sh

+28-96
@@ -28,14 +28,9 @@ function install_folly() {
   echo -e "${COLOR_GREEN}[ INFO ] Building Folly ${COLOR_OFF}"
   cd $FOLLY_SRC_DIR

-  if [ "$PLATFORM" = "Linux" ]; then
-    SUDO="sudo"
-  elif [ "$PLATFORM" = "Mac" ]; then
-    SUDO=""
-  fi
-  $SUDO ./build/fbcode_builder/getdeps.py install-system-deps --recursive
+  ./build/fbcode_builder/getdeps.py install-system-deps --recursive

-  $SUDO ./build/fbcode_builder/getdeps.py build \
+  ./build/fbcode_builder/getdeps.py build \
     --allow-system-packages \
     --scratch-path $FOLLY_BUILD_DIR \
     --extra-cmake-defines='{"CMAKE_CXX_FLAGS": "-fPIC -D_GLIBCXX_USE_CXX11_ABI=1"}'
@@ -47,36 +42,29 @@ function install_folly() {
   echo "$FOLLY_BUILD_DIR/installed"
 }

-function install_kineto() {
-  if [ "$PLATFORM" = "Linux" ]; then
-    echo -e "${COLOR_GREEN}[ INFO ] Skip install kineto on Linux ${COLOR_OFF}"
-  elif [ "$PLATFORM" = "Mac" ]; then
-    KINETO_SRC_DIR=$BASE_DIR/third-party/kineto
-
-    if [ ! -d "$KINETO_SRC_DIR/libkineto/build" ] ; then
-      cd $KINETO_SRC_DIR/libkineto
-      mkdir build && cd build
-      cmake ..
-      make install
-    fi
-  fi
-
-  cd "$BWD" || exit
-}
-
 function install_libtorch() {
+  cd "$DEPS_DIR" || exit
   TORCH_VERSION="2.2.1"
+  if [ -d "$DEPS_DIR/libtorch" ]; then
+    RAW_VERSION=`cat "$DEPS_DIR/libtorch/build-version"`
+    VERSION=`cat "$DEPS_DIR/libtorch/build-version" | cut -d "+" -f 1`
+    if [ "$USE_NIGHTLIES" = "true" ] && [[ ! "${RAW_VERSION}" =~ .*"dev".* ]]; then
+      rm -rf "$DEPS_DIR/libtorch"
+    elif [ "$USE_NIGHTLIES" == "" ] && [ "$VERSION" != "$TORCH_VERSION" ]; then
+      rm -rf "$DEPS_DIR/libtorch"
+    fi
+  fi
   if [ "$PLATFORM" = "Mac" ]; then
     if [ ! -d "$DEPS_DIR/libtorch" ]; then
       if [[ $(uname -m) == 'x86_64' ]]; then
         echo -e "${COLOR_GREEN}[ INFO ] Install libtorch on Mac x86_64 ${COLOR_OFF}"
-        wget https://download.pytorch.org/libtorch/cpu/libtorch-macos-x86_64-${TORCH_VERSION}.zip
-        unzip libtorch-macos-x86_64-${TORCH_VERSION}.zip
+        wget -q https://download.pytorch.org/libtorch/cpu/libtorch-macos-x86_64-${TORCH_VERSION}.zip
+        unzip -q libtorch-macos-x86_64-${TORCH_VERSION}.zip
         rm libtorch-macos-x86_64-${TORCH_VERSION}.zip
       else
         echo -e "${COLOR_GREEN}[ INFO ] Install libtorch on Mac arm64 ${COLOR_OFF}"
-        wget https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-${TORCH_VERSION}.zip
-        unzip libtorch-macos-arm64-${TORCH_VERSION}.zip
+        wget -q https://download.pytorch.org/libtorch/cpu/libtorch-macos-arm64-${TORCH_VERSION}.zip
+        unzip -q libtorch-macos-arm64-${TORCH_VERSION}.zip
         rm libtorch-macos-arm64-${TORCH_VERSION}.zip
       fi
     fi
@@ -86,27 +74,17 @@ function install_libtorch() {
     echo -e "${COLOR_RED}[ ERROR ] Unknown platform: $PLATFORM ${COLOR_OFF}"
     exit 1
   else # Linux
-    if [ -d "$DEPS_DIR/libtorch" ]; then
-      RAW_VERSION=`cat "$DEPS_DIR/libtorch/build-version"`
-      VERSION=`cat "$DEPS_DIR/libtorch/build-version" | cut -d "+" -f 1`
-      if [ "$USE_NIGHTLIES" = "true" ] && [[ ! "${RAW_VERSION}" =~ .*"dev".* ]]; then
-        rm -rf "$DEPS_DIR/libtorch"
-      elif [ "$USE_NIGHTLIES" == "" ] && [ "$VERSION" != "$TORCH_VERSION" ]; then
-        rm -rf "$DEPS_DIR/libtorch"
-      fi
-    fi
     if [ ! -d "$DEPS_DIR/libtorch" ]; then
-      cd "$DEPS_DIR" || exit
       echo -e "${COLOR_GREEN}[ INFO ] Install libtorch on Linux ${COLOR_OFF}"
       if [ "$USE_NIGHTLIES" == true ]; then
        URL=https://download.pytorch.org/libtorch/nightly/${CUDA}/libtorch-cxx11-abi-shared-with-deps-latest.zip
       else
        URL=https://download.pytorch.org/libtorch/${CUDA}/libtorch-cxx11-abi-shared-with-deps-${TORCH_VERSION}%2B${CUDA}.zip
       fi
-      wget $URL
+      wget -q $URL
       ZIP_FILE=$(basename "$URL")
       ZIP_FILE="${ZIP_FILE//%2B/+}"
-      unzip $ZIP_FILE
+      unzip -q $ZIP_FILE
       rm $ZIP_FILE
     fi
     echo -e "${COLOR_GREEN}[ INFO ] libtorch is installed ${COLOR_OFF}"
@@ -115,58 +93,22 @@ function install_libtorch() {
   cd "$BWD" || exit
 }

-function install_yaml_cpp() {
-  YAML_CPP_SRC_DIR=$BASE_DIR/third-party/yaml-cpp
-  YAML_CPP_BUILD_DIR=$DEPS_DIR/yaml-cpp-build
-
-  if [ ! -d "$YAML_CPP_BUILD_DIR" ] ; then
-    echo -e "${COLOR_GREEN}[ INFO ] Building yaml-cpp ${COLOR_OFF}"
-
-    if [ "$PLATFORM" = "Linux" ]; then
-      SUDO="sudo"
-    elif [ "$PLATFORM" = "Mac" ]; then
-      SUDO=""
-    fi
-
-    mkdir $YAML_CPP_BUILD_DIR
-    cd $YAML_CPP_BUILD_DIR
-    cmake $YAML_CPP_SRC_DIR -DYAML_BUILD_SHARED_LIBS=ON -DYAML_CPP_BUILD_TESTS=OFF -DCMAKE_CXX_FLAGS="-fPIC"
-    $SUDO make install
-
-    echo -e "${COLOR_GREEN}[ INFO ] yaml-cpp is installed ${COLOR_OFF}"
-  fi
-
-  cd "$BWD" || exit
-}
-
-function build_llama_cpp() {
-  BWD=$(pwd)
-  LLAMA_CPP_SRC_DIR=$BASE_DIR/third-party/llama.cpp
-  cd "${LLAMA_CPP_SRC_DIR}"
-  if [ "$PLATFORM" = "Mac" ]; then
-    make LLAMA_METAL=OFF -j
-  else
-    make -j
-  fi
-  cd "$BWD" || exit
-}
-
 function prepare_test_files() {
   echo -e "${COLOR_GREEN}[ INFO ]Preparing test files ${COLOR_OFF}"
   local EX_DIR="${TR_DIR}/examples/"
   rsync -a --link-dest=../../test/resources/ ${BASE_DIR}/test/resources/ ${TR_DIR}/
   if [ ! -f "${EX_DIR}/babyllama/babyllama_handler/tokenizer.bin" ]; then
-    wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin -O "${EX_DIR}/babyllama/babyllama_handler/tokenizer.bin"
+    wget -q https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin -O "${EX_DIR}/babyllama/babyllama_handler/tokenizer.bin"
   fi
   if [ ! -f "${EX_DIR}/babyllama/babyllama_handler/stories15M.bin" ]; then
-    wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin -O "${EX_DIR}/babyllama/babyllama_handler/stories15M.bin"
+    wget -q https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin -O "${EX_DIR}/babyllama/babyllama_handler/stories15M.bin"
   fi
   # PT2.2 torch.expport does not support Mac
   if [ "$PLATFORM" = "Linux" ]; then
     if [ ! -f "${EX_DIR}/aot_inductor/llama_handler/stories15M.so" ]; then
       local HANDLER_DIR=${EX_DIR}/aot_inductor/llama_handler/
       if [ ! -f "${HANDLER_DIR}/stories15M.pt" ]; then
-        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt?download=true -O "${HANDLER_DIR}/stories15M.pt"
+        wget -q https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt?download=true -O "${HANDLER_DIR}/stories15M.pt"
       fi
       local LLAMA_SO_DIR=${BASE_DIR}/third-party/llama2.so/
       PYTHONPATH=${LLAMA_SO_DIR}:${PYTHONPATH} python ${BASE_DIR}/../examples/cpp/aot_inductor/llama2/compile.py --checkpoint ${HANDLER_DIR}/stories15M.pt ${HANDLER_DIR}/stories15M.so
@@ -221,12 +163,11 @@ function build() {

   # Build torchserve_cpp with cmake
   cd "$BWD" || exit
-  YAML_CPP_CMAKE_DIR=$DEPS_DIR/yaml-cpp-build
   FOLLY_CMAKE_DIR=$DEPS_DIR/folly-build/installed
   find $FOLLY_CMAKE_DIR -name "lib*.*" -exec ln -s "{}" $LIBS_DIR/ \;
   if [ "$PLATFORM" = "Linux" ]; then
     cmake \
-      -DCMAKE_PREFIX_PATH="$DEPS_DIR;$FOLLY_CMAKE_DIR;$YAML_CPP_CMAKE_DIR;$DEPS_DIR/libtorch" \
+      -DCMAKE_PREFIX_PATH="$DEPS_DIR;$FOLLY_CMAKE_DIR;$DEPS_DIR/libtorch" \
       -DCMAKE_INSTALL_PREFIX="$PREFIX" \
       "$MAYBE_BUILD_QUIC" \
       "$MAYBE_BUILD_TESTS" \
@@ -242,8 +183,10 @@ function build() {
       export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/bin/nvcc
     fi
   elif [ "$PLATFORM" = "Mac" ]; then
+    export LIBRARY_PATH=${LIBRARY_PATH}:`brew --prefix icu4c`/lib:`brew --prefix libomp`/lib
+
     cmake \
-      -DCMAKE_PREFIX_PATH="$DEPS_DIR;$FOLLY_CMAKE_DIR;$YAML_CPP_CMAKE_DIR;$DEPS_DIR/libtorch" \
+      -DCMAKE_PREFIX_PATH="$DEPS_DIR;$FOLLY_CMAKE_DIR;$DEPS_DIR/libtorch" \
       -DCMAKE_INSTALL_PREFIX="$PREFIX" \
       "$MAYBE_BUILD_QUIC" \
       "$MAYBE_BUILD_TESTS" \
@@ -252,9 +195,10 @@ function build() {
       "$MAYBE_USE_STATIC_DEPS" \
       "$MAYBE_LIB_FUZZING_ENGINE" \
       "$MAYBE_NIGHTLIES" \
+      "-DLLAMA_METAL=OFF" \
       ..

-    export LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/opt/icu4c/lib
+
   else
     # TODO: Windows
     echo -e "${COLOR_RED}[ ERROR ] Unknown platform: $PLATFORM ${COLOR_OFF}"
@@ -282,16 +226,8 @@ function symlink_torch_libs() {
   fi
 }

-function symlink_yaml_cpp_lib() {
-  if [ "$PLATFORM" = "Linux" ]; then
-    ln -sf ${DEPS_DIR}/yaml-cpp-build/*.so* ${LIBS_DIR}
-  elif [ "$PLATFORM" = "Mac" ]; then
-    ln -sf ${DEPS_DIR}/yaml-cpp-build/*.dylib* ${LIBS_DIR}
-  fi
-}
-
 function install_torchserve_cpp() {
-  TARGET_DIR=$BASE_DIR/../ts/cpp/
+  TARGET_DIR=`python -c "import ts; from pathlib import Path; print(Path(ts.__file__).parent / 'cpp')"`

   if [ -d $TARGET_DIR ]; then
     rm -rf $TARGET_DIR
@@ -370,12 +306,8 @@ cd $BASE_DIR
 git submodule update --init --recursive

 install_folly
-#install_kineto
 install_libtorch
-install_yaml_cpp
-build_llama_cpp
 prepare_test_files
 build
 symlink_torch_libs
-symlink_yaml_cpp_lib
 install_torchserve_cpp
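
The reworked `install_libtorch` above hoists the cached-libtorch check ahead of the platform branch: if a `libtorch` directory already exists under the deps dir, its `build-version` file decides whether the cache is stale (a nightly was requested but a release is cached, or the pinned version changed). Below is a rough Python sketch of that decision for readers who find the bash harder to follow; the `build-version` contents mirror the script, while the paths and helper name are illustrative only.

```python
# Rough Python sketch of the libtorch cache check performed in cpp/build.sh above.
# Assumes a libtorch directory containing a "build-version" file such as
# "2.2.1+cpu" (release) or "2.3.0.dev20240301+cpu" (nightly); paths are illustrative.
from pathlib import Path
import shutil

TORCH_VERSION = "2.2.1"

def should_refresh_libtorch(deps_dir: Path, use_nightlies: bool) -> bool:
    build_version = deps_dir / "libtorch" / "build-version"
    if not build_version.exists():
        return True  # nothing cached yet, download
    raw_version = build_version.read_text().strip()   # e.g. "2.2.1+cpu"
    version = raw_version.split("+", 1)[0]             # e.g. "2.2.1"
    if use_nightlies and "dev" not in raw_version:
        return True  # nightly requested but a release build is cached
    if not use_nightlies and version != TORCH_VERSION:
        return True  # pinned version changed since the cache was created
    return False

if should_refresh_libtorch(Path("cpp/_build/_deps"), use_nightlies=False):
    shutil.rmtree("cpp/_build/_deps/libtorch", ignore_errors=True)
    # ...download and unzip the matching libtorch archive here...
```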

‎cpp/third-party/llama.cpp

-1
This file was deleted.

‎docs/configuration.md

+1
@@ -297,6 +297,7 @@ e.g. : To allow base URLs `https://s3.amazonaws.com/` and `https://torchserve.py
 * For security reason, `use_env_allowed_urls=true` is required in config.properties to read `allowed_urls` from environment variable.
 * `workflow_store` : Path of workflow store directory. Defaults to model store directory.
 * `disable_system_metrics` : Disable collection of system metrics when set to "true". Default value is "false".
+* `system_metrics_cmd`: The customized system metrics python script name with arguments. For example: `ts/metrics/metric_collector.py --gpu 0`. Default: empty which means TorchServe collects system metrics via "ts/metrics/metric_collector.py --gpu $CUDA_VISIBLE_DEVICES".

 **NOTE**

‎docs/token_authorization_api.md

+6-2
@@ -1,8 +1,12 @@
 # TorchServe token authorization API

+## Setup
+1. Download the jar files from [Maven](https://mvnrepository.com/artifact/org.pytorch/torchserve-endpoint-plugin)
+2. Enable token authorization by adding the `--plugins-path /path/to/the/jar/files` flag at start up with the path leading to the downloaded jar files.
+
 ## Configuration
-1. Enable token authorization by adding the provided plugin at start using the `--plugins-path` command.
-2. Torchserve will enable token authorization if the plugin is provided. In the current working directory a file `key_file.json` will be generated.
+1. Torchserve will enable token authorization if the plugin is provided. Expected log statement `[INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Loading plugin for endpoint token`
+2. In the current working directory a file `key_file.json` will be generated.
   1. Example key file:

 ```python

‎examples/cpp/llamacpp/CMakeLists.txt

+13-13
@@ -1,20 +1,20 @@
 set(LLAMACPP_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/third-party/llama.cpp")
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 add_library(llamacpp_handler SHARED src/llamacpp_handler.cc)

-set(MY_OBJECT_FILES
-  ${LLAMACPP_SRC_DIR}/ggml.o
-  ${LLAMACPP_SRC_DIR}/llama.o
-  ${LLAMACPP_SRC_DIR}/common.o
-  ${LLAMACPP_SRC_DIR}/ggml-quants.o
-  ${LLAMACPP_SRC_DIR}/ggml-alloc.o
-  ${LLAMACPP_SRC_DIR}/grammar-parser.o
-  ${LLAMACPP_SRC_DIR}/console.o
-  ${LLAMACPP_SRC_DIR}/build-info.o
-  ${LLAMACPP_SRC_DIR}/ggml-backend.o
-
+FetchContent_Declare(
+  llama.cpp
+  GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+  GIT_TAG b2241
 )
+FetchContent_GetProperties(llama.cpp)
+
+if(NOT llama.cpp_POPULATED)
+  message(STATUS "Fetching llama.cpp...")
+  FetchContent_Populate(llama.cpp)
+  add_subdirectory(${llama.cpp_SOURCE_DIR} ${llama.cpp_BINARY_DIR})
+endif()

-target_sources(llamacpp_handler PRIVATE ${MY_OBJECT_FILES})
 target_include_directories(llamacpp_handler PUBLIC ${LLAMACPP_SRC_DIR})
-target_link_libraries(llamacpp_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES})
+target_link_libraries(llamacpp_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES} common llama)

‎examples/cpp/llamacpp/src/llamacpp_handler.cc

+1-1
@@ -44,7 +44,7 @@ LlamaCppHandler::LoadModel(
   params.main_gpu = 0;
   params.n_gpu_layers = 35;

-  llama_backend_init(params.numa);
+  llama_backend_init();
   ctx_params = llama_context_default_params();
   model_params = llama_model_default_params();
   llamamodel = llama_load_model_from_file(params.model.c_str(), model_params);

‎frontend/server/src/main/java/org/pytorch/serve/metrics/MetricCollector.java

+15-7
@@ -7,6 +7,7 @@
 import java.io.OutputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 import org.apache.commons.io.IOUtils;
@@ -32,16 +33,23 @@ public MetricCollector(ConfigManager configManager) {
     public void run() {
         try {
             // Collect System level Metrics
-            String[] args = new String[4];
-            args[0] = configManager.getPythonExecutable();
-            args[1] = "ts/metrics/metric_collector.py";
-            args[2] = "--gpu";
-            args[3] = String.valueOf(ConfigManager.getInstance().getNumberOfGpu());
+            List<String> args = new ArrayList<>();
+            args.add(configManager.getPythonExecutable());
+            String systemMetricsCmd = configManager.getSystemMetricsCmd();
+            if (systemMetricsCmd.isEmpty()) {
+                systemMetricsCmd =
+                        String.format(
+                                "%s --gpu %s",
+                                "ts/metrics/metric_collector.py",
+                                String.valueOf(configManager.getNumberOfGpu()));
+            }
+            args.addAll(Arrays.asList(systemMetricsCmd.split("\\s+")));
             File workingDir = new File(configManager.getModelServerHome());

             String[] envp = EnvironmentUtils.getEnvString(workingDir.getAbsolutePath(), null, null);
-
-            final Process p = Runtime.getRuntime().exec(args, envp, workingDir); // NOPMD
+            final Process p =
+                    Runtime.getRuntime()
+                            .exec(args.toArray(new String[0]), envp, workingDir); // NOPMD
             ModelManager modelManager = ModelManager.getInstance();
             Map<Integer, WorkerThread> workerMap = modelManager.getWorkers();
             try (OutputStream os = p.getOutputStream()) {
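
The new `MetricCollector.run()` logic falls back to the bundled collector when `system_metrics_cmd` is empty and otherwise splits the configured command on whitespace before prepending the Python executable. The same fallback is sketched below as a small, hypothetical Python snippet; the executable name and GPU count are stand-ins, and the authoritative behavior is the Java code above.

```python
# Hypothetical Python sketch of the fallback in MetricCollector.run() above.
# "python3" and num_gpus stand in for the configured python executable
# and ConfigManager.getNumberOfGpu().
import re

def build_metrics_args(python_exe, system_metrics_cmd, num_gpus):
    if not system_metrics_cmd:
        # Default: the bundled collector with one --gpu argument.
        system_metrics_cmd = f"ts/metrics/metric_collector.py --gpu {num_gpus}"
    # Split the configured command on whitespace, mirroring split("\\s+") in Java.
    return [python_exe] + re.split(r"\s+", system_metrics_cmd.strip())

print(build_metrics_args("python3", "", 2))
# ['python3', 'ts/metrics/metric_collector.py', '--gpu', '2']
print(build_metrics_args("python3", "ts/metrics/metric_collector.py --gpu 0", 2))
# ['python3', 'ts/metrics/metric_collector.py', '--gpu', '0']
```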

‎frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java

+8-1
@@ -117,6 +117,7 @@ public final class ConfigManager {
     private static final String MODEL_SNAPSHOT = "model_snapshot";
     private static final String MODEL_CONFIG = "models";
     private static final String VERSION = "version";
+    private static final String SYSTEM_METRICS_CMD = "system_metrics_cmd";

     // Configuration default values
     private static final String DEFAULT_TS_ALLOWED_URLS = "file://.*|http(s)?://.*";
@@ -559,6 +560,10 @@ public String getCertificateFile() {
         return prop.getProperty(TS_CERTIFICATE_FILE);
     }

+    public String getSystemMetricsCmd() {
+        return prop.getProperty(SYSTEM_METRICS_CMD, "");
+    }
+
     public SslContext getSslContext() throws IOException, GeneralSecurityException {
         List<String> supportedCiphers =
                 Arrays.asList(
@@ -734,7 +739,9 @@ public String dumpConfigurations() {
                 + "\nCPP log config: "
                 + (getTsCppLogConfig() == null ? "N/A" : getTsCppLogConfig())
                 + "\nModel config: "
-                + prop.getProperty(MODEL_CONFIG, "N/A");
+                + prop.getProperty(MODEL_CONFIG, "N/A")
+                + "\nSystem metrics command: "
+                + (getSystemMetricsCmd().isEmpty() ? "default" : getSystemMetricsCmd());
     }

     public boolean useNativeIo() {

‎kubernetes/kserve/kserve_wrapper/__main__.py

+21-31
@@ -28,52 +28,42 @@ def parse_config():
         model_store: the path in which the .mar file resides
     """
     separator = "="
-    keys = {}
+    ts_configuration = {}
     config_path = os.environ.get("CONFIG_PATH", DEFAULT_CONFIG_PATH)

     logging.info(f"Wrapper: loading configuration from {config_path}")

     with open(config_path) as f:
         for line in f:
-            if separator in line:
-                # Find the name and value by splitting the string
-                name, value = line.split(separator, 1)
-
-                # Assign key value pair to dict
-                # strip() removes white space from the ends of strings
-                keys[name.strip()] = value.strip()
-
-    keys["model_snapshot"] = json.loads(keys["model_snapshot"])
-    inference_address, management_address, grpc_inference_port, model_store = (
-        keys["inference_address"],
-        keys["management_address"],
-        keys["grpc_inference_port"],
-        keys["model_store"],
+            if not line.startswith("#"):
+                if separator in line:
+                    name, value = line.split(separator, 1)
+                    ts_configuration[name.strip()] = value.strip()
+
+    ts_configuration["model_snapshot"] = json.loads(
+        ts_configuration.get("model_snapshot", "{}")
     )

-    models = keys["model_snapshot"]["models"]
-    model_names = []
+    inference_address = ts_configuration.get(
+        "inference_address", DEFAULT_INFERENCE_ADDRESS
+    )
+    management_address = ts_configuration.get(
+        "management_address", DEFAULT_MANAGEMENT_ADDRESS
+    )
+    grpc_inference_port = ts_configuration.get(
+        "grpc_inference_port", DEFAULT_GRPC_INFERENCE_PORT
+    )
+    model_store = ts_configuration.get("model_store", DEFAULT_MODEL_STORE)

     # Get all the model_names
-    for model, value in models.items():
-        model_names.append(model)
+    model_names = ts_configuration["model_snapshot"].get("models", {}).keys()

-    if not inference_address:
-        inference_address = DEFAULT_INFERENCE_ADDRESS
     if not model_names:
         model_names = [DEFAULT_MODEL_NAME]
-    if not inference_address:
-        inference_address = DEFAULT_INFERENCE_ADDRESS
-    if not management_address:
-        management_address = DEFAULT_MANAGEMENT_ADDRESS
+
     inf_splits = inference_address.split(":")
-    if not grpc_inference_port:
-        grpc_inference_address = inf_splits[1] + ":" + DEFAULT_GRPC_INFERENCE_PORT
-    else:
-        grpc_inference_address = inf_splits[1] + ":" + grpc_inference_port
+    grpc_inference_address = inf_splits[1] + ":" + grpc_inference_port
     grpc_inference_address = grpc_inference_address.replace("/", "")
-    if not model_store:
-        model_store = DEFAULT_MODEL_STORE

     logging.info(
         "Wrapper : Model names %s, inference address %s, management address %s, grpc_inference_address, %s, model store %s",

‎ts_scripts/install_dependencies.py

+6-4
@@ -64,7 +64,9 @@
     "xz",
     "openssl",
     "libsodium",
-    "llv",
+    "icu4c",
+    "libomp",
+    "llvm",
 )

 CPP_DARWIN_DEPENDENCIES_LINK = (
@@ -286,13 +288,13 @@ def install_cpp_dependencies(self):
         os.system(f"brew install -f {' '.join(CPP_DARWIN_DEPENDENCIES)}")
         os.system(f"brew link {' '.join(CPP_DARWIN_DEPENDENCIES_LINK)}")
         os.system(
-            'ln -s "$(brew --prefix llvm)/bin/clang-format" "/usr/local/bin/clang-format"'
+            f'{self.sudo_cmd} ln -s "$(brew --prefix llvm)/bin/clang-format" "/usr/local/bin/clang-format"'
         )
         os.system(
-            'ln -s "$(brew --prefix llvm)/bin/clang-tidy" "/usr/local/bin/clang-tidy"'
+            f'{self.sudo_cmd} ln -s "$(brew --prefix llvm)/bin/clang-tidy" "/usr/local/bin/clang-tidy"'
         )
         os.system(
-            'ln -s "$(brew --prefix llvm)/bin/clang-apply-replacements" "/usr/local/bin/clang-apply-replacements"'
+            f'{self.sudo_cmd} ln -s "$(brew --prefix llvm)/bin/clang-apply-replacements" "/usr/local/bin/clang-apply-replacements"'
         )

     def install_neuronx_driver(self):

‎ts_scripts/print_env_info.py

+29
@@ -43,6 +43,8 @@

 npm_env = {"npm_pkg_version": []}

+cpp_env = {"LIBRARY_PATH": ""}
+

 def get_nvidia_smi():
     # Note: nvidia-smi is currently available only on Windows and Linux
@@ -284,6 +286,16 @@ def get_torch_model_archiver():
     return version


+def get_library_path():
+    platform = get_platform()
+    if platform == "darwin":
+        return os.environ.get("DYLD_LIBRARY_PATH", "")
+    elif platform == "linux":
+        return os.environ.get("LD_LIBRARY_PATH", "")
+    else:
+        return ""
+
+
 def populate_torchserve_env(torch_pkg):
     for pkg in torch_pkg:
         if pkg.split("==")[0] == "torch":
@@ -338,6 +350,10 @@ def populate_npm_env():
     npm_env["npm_pkg_version"] = get_npm_packages()


+def populate_cpp_env():
+    cpp_env["LIBRARY_PATH"] = get_library_path()
+
+
 def populate_env_info():
     # torchserve packages
     _, torch_list_output = get_pip_packages("torch")
@@ -361,6 +377,9 @@
     if get_platform() == "darwin":
         populate_npm_env()

+    if get_platform() in ("darwin", "linux"):
+        populate_cpp_env()
+

 env_info_fmt = """
 ------------------------------------------------------------------------------------------
@@ -403,18 +422,25 @@ def populate_env_info():
 {npm_pkg_version}
 """

+cpp_env_info_fmt = """
+Environment:
+library_path (LD_/DYLD_): {LIBRARY_PATH}
+"""
+

 def get_pretty_env_info(branch_name):
     global env_info_fmt
     global cuda_info_fmt
     global npm_info_fmt
+    global cpp_env_info_fmt
     populate_env_info()
     env_dict = {
         **torchserve_env,
         **python_env,
         **java_env,
         **os_info,
         "torchserve_branch": branch_name,
+        **cpp_env,
     }


@@ -425,6 +451,9 @@ def get_pretty_env_info(branch_name):
         env_dict.update(npm_env)
         env_info_fmt = env_info_fmt + "\n" + npm_info_fmt

+    if get_platform() in ("darwin", "linux"):
+        env_info_fmt = env_info_fmt + "\n" + cpp_env_info_fmt
+
     return env_info_fmt.format(**env_dict)

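
The new `cpp_env` section reports whichever dynamic-linker search path applies to the host: `DYLD_LIBRARY_PATH` on macOS, `LD_LIBRARY_PATH` on Linux, and an empty string elsewhere. A tiny standalone sketch of that selection is shown below; it uses `sys.platform` rather than the script's `get_platform()` helper, and the print line mirrors `cpp_env_info_fmt`.

```python
# Minimal sketch of the platform dispatch in get_library_path() above.
# Runs standalone; prints an empty value if the variable is unset.
import os
import sys

if sys.platform == "darwin":
    library_path = os.environ.get("DYLD_LIBRARY_PATH", "")
elif sys.platform.startswith("linux"):
    library_path = os.environ.get("LD_LIBRARY_PATH", "")
else:
    library_path = ""

print(f"library_path (LD_/DYLD_): {library_path}")
```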

‎ts_scripts/sanity_utils.py

+49-8
@@ -1,3 +1,4 @@
+import asyncio
 import glob
 import json
 import os
@@ -18,17 +19,57 @@
 )


-def run_markdown_link_checker():
-    print("## Started markdown link checker")
-    result = True
-    for mdfile in glob.glob("**/*.md", recursive=True):
+async def markdown_link_checker(in_queue, out_queue, n):
+    print(f"worker started {n}")
+    while True:
+        mdfile = await in_queue.get()
+        output = []
+        result = True
         cmd = f"markdown-link-check {mdfile} --config link_check_config.json"
-        print(f"## In directory: {os.getcwd()} | Executing command: {cmd}")
-        status = os.system(cmd)
+        output.append(f"## In directory: {os.getcwd()} | Executing command: {cmd}")
+        p = await asyncio.create_subprocess_shell(
+            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+        )
+        while not p.stdout.at_eof():
+            line = await p.stdout.readline()
+            output.append(line.decode("utf-8"))
+
+        status = await p.wait()
         if status != 0:
-            print(f"## Broken links in file: {mdfile}")
+            output.append(f"## Broken links in file: {mdfile}")
             result = False
-    return result
+        await out_queue.put((result, output))
+
+
+async def run_markdown_link_checker_on_files(files):
+    results = []
+    tasks = []
+    in_queue = asyncio.Queue()
+    out_queue = asyncio.Queue()
+    for f in files:
+        in_queue.put_nowait(f)
+
+    for n in range(16):
+        tasks.append(asyncio.create_task(markdown_link_checker(in_queue, out_queue, n)))
+
+    while len(results) != len(files):
+        print(len(results))
+        r, output = await out_queue.get()
+        results.append(r)
+        for line in output:
+            print(line)
+
+    for t in tasks:
+        t.cancel()
+
+    return results
+
+
+def run_markdown_link_checker():
+    print("## Started markdown link checker")
+    files = glob.glob("**/*.md", recursive=True)
+    results = asyncio.run(run_markdown_link_checker_on_files(files))
+    return all(results)


 def validate_model_on_gpu():