Commit f433a66

Merge branch 'master' into dependabot/pip/requirements/requests-2.32.0
2 parents: 5fd63e8 + b891309

File tree: 9 files changed, +149 -29 lines

.github/workflows/regression_tests_cpu_binaries.yml (+14 -19)
```
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, macOS-latest]
+        os: [ubuntu-20.04, macos-latest]
         python-version: ["3.8", "3.9", "3.10"]
         binaries: ["pypi", "conda"]
         exclude:
@@ -31,38 +31,33 @@ jobs:
         with:
           submodules: recursive
       - name: Setup conda with Python ${{ matrix.python-version }}
-        if: matrix.os == 'macos-14'
         uses: conda-incubator/setup-miniconda@v3
         with:
           auto-update-conda: true
           channels: anaconda, conda-forge
           python-version: ${{ matrix.python-version }}
-      - name: Setup conda with Python ${{ matrix.python-version }}
-        if: matrix.os != 'macos-14'
-        uses: s-weigand/setup-conda@v1
-        with:
-          update-conda: true
-          python-version: ${{ matrix.python-version }}
-          conda-channels: anaconda, conda-forge
       - name: Setup Java 17
         uses: actions/setup-java@v3
         with:
           distribution: 'zulu'
           java-version: '17'
       - name: Checkout TorchServe
         uses: actions/checkout@v3
-      - name: Run install dependencies and regression test
-        if: matrix.os == 'macos-14'
-        shell: bash -el {0}
-        run: |
-          conda info
-          python ts_scripts/install_dependencies.py --environment=dev
-          python test/regression_tests.py --binaries --${{ matrix.binaries }} --nightly
       - name: Install dependencies
-        if: matrix.os != 'macos-14'
+        shell: bash -el {0}
         run: |
+          echo "=====CHECK ENV AND PYTHON VERSION===="
+          conda info --envs
+          python --version
+          echo "=====RUN INSTALL DEPENDENCIES===="
           python ts_scripts/install_dependencies.py --environment=dev
-      - name: Validate Torchserve CPU Regression
-        if: matrix.os != 'macos-14'
+      - name: Torchserve Regression Tests
+        shell: bash -el {0}
+        env:
+          TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
         run: |
+          echo "=====CHECK ENV AND PYTHON VERSION===="
+          conda info --envs
+          python --version
+          echo "=====RUN REGRESSION TESTS===="
           python test/regression_tests.py --binaries --${{ matrix.binaries }} --nightly
```
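The new `Torchserve Regression Tests` step exports `TS_MAC_ARM64_CPU_ONLY`, which evaluates to `'True'` only on the `macos-latest` (Apple Silicon) runner. The workflow itself does not show how the variable is consumed; purely as a hypothetical sketch, a downstream script could gate GPU-specific work on it like this (not the actual TorchServe code):

```python
import os

# Hypothetical consumer of the TS_MAC_ARM64_CPU_ONLY variable set by the workflow;
# the real logic lives in TorchServe's install/test scripts and may differ.
mac_arm64_cpu_only = os.environ.get("TS_MAC_ARM64_CPU_ONLY", "False") == "True"

if mac_arm64_cpu_only:
    print("Apple Silicon CPU-only runner: skipping CUDA-specific steps")
else:
    print("Running the full regression matrix")
```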

.github/workflows/regression_tests_gpu_binaries.yml (+4 -9)
```
@@ -39,12 +39,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
-      - name: Setup Conda
-        uses: s-weigand/setup-conda@v1
-        with:
-          update-conda: true
-          python-version: ${{ matrix.python-version }}
-          conda-channels: anaconda, conda-forge
+      - run: python --version
       - run: conda --version
       - name: Setup Java 17
         uses: actions/setup-java@v3
@@ -53,17 +48,17 @@
           java-version: '17'
       - name: Install dependencies
         shell: bash -el {0}
-        run: |
+        run: |
           echo "=====CHECK ENV AND PYTHON VERSION===="
           /home/ubuntu/actions-runner/_work/serve/serve/3/condabin/conda info --envs
           python --version
           echo "=====RUN INSTALL DEPENDENCIES===="
           python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
       - name: Torchserve Regression Tests
-        shell: bash -el {0}
+        shell: bash -el {0}
         run: |
           echo "=====CHECK ENV AND PYTHON VERSION===="
           /home/ubuntu/actions-runner/_work/serve/serve/3/condabin/conda info --envs
           python --version
           echo "=====RUN REGRESSION TESTS===="
-          python test/regression_tests.py --binaries --${{ matrix.binaries }} --nightly
+          python test/regression_tests.py --binaries --${{ matrix.binaries }} --nightly
```

The three `-`/`+` pairs with identical visible text are whitespace-only changes; the text itself is unchanged.
New file, +96 lines (the README for the gpt_fast_mixtral_moe example):
## Mixtral-MOE

We will be using [Mixtral-MOE](https://huggingface.co/docs/transformers/en/model_doc/mixtral).

It features:

* 8 experts per MLP
* 45 billion parameters
* compute required is the same as that of a 14 billion parameter model
* Sliding Window Attention
* GQA
* Byte-fallback BPE tokenizer

As a low-level framework we will be using [GPT fast](https://github.com/pytorch-labs/gpt-fast).

#### Pre-requisites

- PyTorch 2.3
- CUDA >= 11.8

`cd` to the example folder `examples/large_models/gpt_fast_mixtral_moe`.

Install dependencies:
```
git clone https://github.com/pytorch-labs/gpt-fast/
pip install sentencepiece huggingface_hub
```

### Step 1: Download and convert the weights

Currently supported models:
```
mistralai/Mixtral-8x7B-v0.1
```
Prepare the weights:
```
export MODEL_REPO=mistralai/Mixtral-8x7B-v0.1
huggingface-cli login
python gpt-fast/mixtral-moe/scripts/download.py --repo_id $MODEL_REPO
python gpt-fast/mixtral-moe/scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$MODEL_REPO
```
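Both helper scripts come from gpt-fast. As a rough, hypothetical equivalent of the download step only (the conversion still needs `convert_hf_checkpoint.py`), you could fetch the weights directly with `huggingface_hub`, which the install step above already pulls in, assuming you have accepted the model's terms on the Hub:

```python
from huggingface_hub import snapshot_download

# Hypothetical stand-in for gpt-fast's download.py: pull the raw Mixtral weights
# into the checkpoints/ layout that convert_hf_checkpoint.py expects.
snapshot_download(
    repo_id="mistralai/Mixtral-8x7B-v0.1",
    local_dir="checkpoints/mistralai/Mixtral-8x7B-v0.1",
)
```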
### Step 1.5: Quantize the model to int8

To speed up model loading and inference even further, we can optionally quantize the model to int8. Please see the [blog post](https://pytorch.org/blog/accelerating-generative-ai-2/) for details on the potential accuracy loss.

```
python gpt-fast/mixtral-moe/quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int8
```

The quantized model will show up as `checkpoints/$MODEL_REPO/model_int8.pth`.

From here on we will use the quantized version because of its lower memory requirements, but you are free to use the original model. To switch, exchange the checkpoint filename in the [`model_config.yaml`](./model_config.yaml) file.
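For reference, the checkpoint is selected by the `converted_ckpt_dir` entry in the handler section of `model_config.yaml` (the full file appears later in this commit); swapping between the quantized and original weights is just a matter of pointing that entry at the other file:

```yaml
handler:
    # default in this example: the int8-quantized checkpoint
    converted_ckpt_dir: "checkpoints/mistralai/Mixtral-8x7B-v0.1/model_int8.pth"
    # to use the unquantized weights instead, point at the original checkpoint:
    # converted_ckpt_dir: "checkpoints/mistralai/Mixtral-8x7B-v0.1/model.pth"
```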
### Step 2: Generate model archive

At this stage we are creating the model archive, which includes the configuration of our model in [model_config.yaml](./model_config.yaml).
It's also the point where we need to decide whether to deploy the model on a single GPU or on multiple GPUs.
For the single-GPU case we can use the default configuration found in [model_config.yaml](./model_config.yaml).
All configs enable the current prototyping feature FxGraphCache by setting `fx_graph_cache` to *true*.
This feature stores the TorchInductor output in a cache to speed up `torch.compile` times when rerunning the handler.
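As a rough illustration only (not the example handler's actual code), enabling the FX graph cache programmatically before compiling looks roughly like this in PyTorch 2.x:

```python
import torch
import torch._inductor.config as inductor_config

# Sketch of what the fx_graph_cache flag amounts to: persist TorchInductor's
# compiled output so repeated torch.compile runs of the handler start faster.
inductor_config.fx_graph_cache = True

model = torch.nn.Linear(8, 8)          # stand-in for the loaded Mixtral-MOE module
compiled = torch.compile(model)
print(compiled(torch.randn(2, 8)).shape)
```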
Please proceed with the [TorchServe installation](https://github.com/pytorch/serve/blob/master/README.md) in order to have `torch-model-archiver` available.

```
torch-model-archiver --model-name gpt_fast_mixtral_moe --version 1.0 --handler ../gpt_fast/handler.py --config-file model_config.yaml --extra-files "gpt-fast/mixtral-moe/generate.py,gpt-fast/mixtral-moe/model.py,gpt-fast/mixtral-moe/quantize.py,gpt-fast/mixtral-moe/tp.py" --archive-format no-archive
mv checkpoints gpt_fast_mixtral_moe/
```

If we want to use the tensor parallel variant and split the model over multiple GPUs, we need to set the desired degree of tensor parallelism in [model_config_tp.yaml](./model_config_tp.yaml) and use that configuration when creating the archive:
```
torch-model-archiver --model-name gpt_fast_mixtral_moe --version 1.0 --handler ../gpt_fast/handler.py --config-file model_config_tp.yaml --extra-files "gpt-fast/mixtral-moe/generate.py,gpt-fast/mixtral-moe/model.py,gpt-fast/mixtral-moe/quantize.py,gpt-fast/mixtral-moe/tp.py" --archive-format no-archive
mv checkpoints gpt_fast_mixtral_moe/
```
### Step 3: Add the model archive to the model store

```
mkdir model_store
mv gpt_fast_mixtral_moe model_store
```

### Step 4: Start TorchServe

```
torchserve --start --ncs --model-store model_store --models gpt_fast_mixtral_moe
```

### Step 5: Run inference

```
curl "http://localhost:8080/predictions/gpt_fast_mixtral_moe" -T request.json
# Returns: Paris, is one of the most visited cities in the world. It is a city of romance, art, culture, and fashion. Paris is home to some of the most iconic landmarks in the world, including the Eiffel Tower
```
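The same request can also be sent from Python; this is just a convenience sketch with the `requests` library (not part of the example), mirroring the curl call above against the default inference port 8080:

```python
import requests

# Send the contents of request.json to the TorchServe inference API,
# equivalent to the curl -T call above.
with open("request.json", "rb") as f:
    response = requests.post(
        "http://localhost:8080/predictions/gpt_fast_mixtral_moe", data=f
    )
print(response.text)
```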
New file, +11 lines (the single-GPU `model_config.yaml` referenced above):

```
#frontend settings
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 200
responseTimeout: 300
deviceType: "gpu"
handler:
    converted_ckpt_dir: "checkpoints/mistralai/Mixtral-8x7B-v0.1/model_int8.pth"
    max_new_tokens: 50
    compile: true
    fx_graph_cache: True
```
New file, +16 lines (the tensor-parallel `model_config_tp.yaml` referenced above):

```
#frontend settings
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 200
responseTimeout: 300
parallelType: "tp"
deviceType: "gpu"
torchrun:
    nproc-per-node: 4
handler:
    profile: true
    converted_ckpt_dir: "checkpoints/mistralai/Mixtral-8x7B-v0.1/model_int8.pth"
    max_new_tokens: 50
    compile: true
    stream: false
    fx_graph_cache: True
```
New file, +4 lines (the `request.json` payload used in Step 5):

```
{
  "prompt": "The capital of France",
  "max_new_tokens": 50
}
```

requirements/common.txt (+1)
```
@@ -5,3 +5,4 @@ packaging==23.2
 pynvml==11.5.0
 pyyaml==6.0
 ninja==1.11.1.1
+setuptools==69.5.1
```

test/pytest/test_continuous_batching.py (+3)
```
@@ -94,6 +94,7 @@ def register_model(mar_file_path, model_store, torchserve):
     test_utils.unregister_model(model_name)
 
 
+@pytest.mark.skip(reason="Skipping this test for now")
 def test_echo_stream_inference(model_name_and_stdout):
     model_name, _ = model_name_and_stdout
     responses = []
@@ -145,6 +146,7 @@ def test_echo_stream_inference(model_name_and_stdout):
     assert all_predictions[3] == "When travelling to NYC, I was able to"
 
 
+@pytest.mark.skip(reason="Skipping this test for now")
 def test_decoding_stage(monkeypatch):
     monkeypatch.syspath_prepend((CURR_FILE_PATH / "test_data" / "streaming"))
 
@@ -211,6 +213,7 @@ def test_decoding_stage(monkeypatch):
     assert ctx.cache["id2"]["encoded"]["attention_mask"].size()[-1] == 11
 
 
+@pytest.mark.skip(reason="Skipping this test for now")
 def test_closed_connection(model_name_and_stdout):
     model_name, stdout = model_name_and_stdout
 
```

ts_scripts/install_dependencies.py (-1)
```
@@ -146,7 +146,6 @@ def install_python_packages(self, cuda_version, requirements_file_path, nightly)
         else:
             self.install_torch_packages(cuda_version)
 
-        os.system(f"{sys.executable} -m pip install -U pip setuptools")
         # developer.txt also installs packages from common.txt
         os.system(f"{sys.executable} -m pip install -U -r {requirements_file_path}")
 
```
