Fix CI and Regression workflows for MAC Arm64 (#3128)

namannandan · web-flow · commit 087e813a7825 · 2024-05-09T17:45:38.000Z
* Update CI and regression CPU workflows for macOS

* Skip MPS tests when running only on the M1 CPU (no MPS device available)

* Fix test parametrization

* Fix env variable config to skip MPS tests

* Update workflow files

* Upgrade nick-fields/retry to v3

* Fix test import error

* Fix Mac M1 CPU only tests

* Fix env variable comparison

* Fix Mac M1 regression test
diff --git a/.github/workflows/ci_cpu.yml b/.github/workflows/ci_cpu.yml
@@ -21,18 +21,19 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, macOS-latest]
+        os: [ubuntu-20.04, macos-latest]
     steps:
       - name: Setup Python for M1
-        if: matrix.os == 'macos-14'
+        if: matrix.os == 'macos-latest'
         uses: actions/setup-python@v5
         with:
           python-version: '3.10'
+          architecture: arm64
       - name: Setup Python for all other OS
-        if: matrix.os != 'macos-14'
+        if: matrix.os != 'macos-latest'
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.9'
           architecture: x64
       - name: Setup Java 17
         uses: actions/setup-java@v3
@@ -47,7 +48,9 @@ jobs:
         run: |
           python ts_scripts/install_dependencies.py --environment=dev
       - name: Torchserve Sanity
-        uses: nick-fields/retry@v2
+        env:
+          TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
+        uses: nick-fields/retry@v3
         with:
           timeout_minutes: 60
           max_attempts: 3
diff --git a/.github/workflows/ci_gpu.yml b/.github/workflows/ci_gpu.yml
@@ -45,7 +45,7 @@ jobs:
         run: |
           python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
       - name: Torchserve Sanity
-        uses: nick-fields/retry@v2
+        uses: nick-fields/retry@v3
         with:
           timeout_minutes: 60
           retry_on: error
diff --git a/.github/workflows/regression_tests_cpu.yml b/.github/workflows/regression_tests_cpu.yml
@@ -15,23 +15,24 @@ concurrency:
 
 jobs:
   regression-cpu:
-    # creates workflows for OS: ubuntu, macOS, macOS M1
+    # creates workflows for OS: ubuntu, macOS M1
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, macOS-latest]
+        os: [ubuntu-20.04, macos-latest]
     steps:
       - name: Setup Python for M1
-        if: matrix.os == 'macos-14'
+        if: matrix.os == 'macos-latest'
         uses: actions/setup-python@v5
         with:
           python-version: '3.10'
+          architecture: arm64
       - name: Setup Python for all other OS
-        if: matrix.os != 'macos-14'
+        if: matrix.os != 'macos-latest'
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.9'
           architecture: x64
       - name: Setup Java 17
         uses: actions/setup-java@v3
@@ -46,5 +47,7 @@ jobs:
         run: |
           python ts_scripts/install_dependencies.py --environment=dev
       - name: Torchserve Regression Tests
+        env:
+          TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
         run: |
           python test/regression_tests.py
diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java
@@ -875,7 +875,8 @@ private static int getAvailableGpu() {
                         }
                     }
                 }
-                throw new AssertionError("Unexpected response.");
+                // No MPS devices detected
+                return 0;
             } else {
                 Process process =
                         Runtime.getRuntime().exec("nvidia-smi --query-gpu=index --format=csv");
diff --git a/frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java b/frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java
@@ -115,8 +115,13 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {
         ConfigManager.init(args);
         ConfigManager configManager = ConfigManager.getInstance();
         String arch = System.getProperty("os.arch");
+        String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");
         if (arch.equals("aarch64")) {
-            Assert.assertTrue(configManager.getNumberOfGpu() > 0);
+            if (mac_arm64_cpu_only.equals("True")) {
+                Assert.assertEquals(configManager.getNumberOfGpu(), 0);
+            } else {
+                Assert.assertTrue(configManager.getNumberOfGpu() > 0);
+            }
         }
     }
 }
diff --git a/test/pytest/test_device_config.py b/test/pytest/test_device_config.py
@@ -19,6 +19,20 @@
 mnist_scriptes_py = os.path.join(REPO_ROOT, "examples/image_classifier/mnist/mnist.py")
 
 HANDLER_PY = """
+import torch
+from ts.torch_handler.base_handler import BaseHandler
+
+class deviceHandler(BaseHandler):
+
+    def initialize(self, context):
+        super().initialize(context)
+        if torch.backends.mps.is_available() and context.system_properties.get("gpu_id") is not None:
+            assert self.get_device().type == "mps"
+        else:
+            assert self.get_device().type == "cpu"
+"""
+
+HANDLER_PY_GPU = """
 from ts.torch_handler.base_handler import BaseHandler
 
 class deviceHandler(BaseHandler):
@@ -28,6 +42,16 @@ def initialize(self, context):
         assert self.get_device().type == "mps"
 """
 
+HANDLER_PY_CPU = """
+from ts.torch_handler.base_handler import BaseHandler
+
+class deviceHandler(BaseHandler):
+
+    def initialize(self, context):
+        super().initialize(context)
+        assert self.get_device().type == "cpu"
+"""
+
 MODEL_CONFIG_YAML = """
     #frontend settings
     # TorchServe frontend parameters
@@ -78,8 +102,23 @@ def get_config(param):
     return get_config(request.param)
 
 
+@pytest.fixture(scope="module")
+def handler_py(request):
+    def get_handler(param):
+        if param == "cpu":
+            return HANDLER_PY_CPU
+        elif param == "gpu":
+            return HANDLER_PY_GPU
+        else:
+            return HANDLER_PY
+
+    return get_handler(request.param)
+
+
 @pytest.fixture(scope="module", name="mar_file_path")
-def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
+def create_mar_file(
+    work_dir, model_archiver, model_name, model_config_name, handler_py
+):
     mar_file_path = work_dir.joinpath(model_name + ".mar")
 
     model_config_yaml_file = work_dir / "model_config.yaml"
@@ -90,7 +129,7 @@ def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
     model_py_file.write_text(mnist_scriptes_py)
 
     handler_py_file = work_dir / "handler.py"
-    handler_py_file.write_text(HANDLER_PY)
+    handler_py_file.write_text(handler_py)
 
     config = ModelArchiverConfig(
         model_name=model_name,
@@ -147,22 +186,29 @@ def register_model(mar_file_path, model_store, torchserve):
     test_utils.unregister_model(model_name)
 
 
-@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
+@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
+@pytest.mark.skipif(
+    os.environ.get("TS_MAC_ARM64_CPU_ONLY", "False") == "True",
+    reason="Skip if running only on MAC CPU",
+)
 @pytest.mark.parametrize("model_config_name", ["gpu"], indirect=True)
+@pytest.mark.parametrize("handler_py", ["gpu"], indirect=True)
 def test_m1_device(model_name, model_config_name):
     response = requests.get(f"http://localhost:8081/models/{model_name}")
     assert response.status_code == 200, "Describe Failed"
 
 
-@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
+@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
 @pytest.mark.parametrize("model_config_name", ["cpu"], indirect=True)
+@pytest.mark.parametrize("handler_py", ["cpu"], indirect=True)
 def test_m1_device_cpu(model_name, model_config_name):
     response = requests.get(f"http://localhost:8081/models/{model_name}")
-    assert response.status_code == 404, "Describe Worked"
+    assert response.status_code == 200, "Describe Failed"
 
 
-@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
+@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
 @pytest.mark.parametrize("model_config_name", ["default"], indirect=True)
+@pytest.mark.parametrize("handler_py", ["default"], indirect=True)
 def test_m1_device_default(model_name, model_config_name):
     response = requests.get(f"http://localhost:8081/models/{model_name}")
     assert response.status_code == 200, "Describe Failed"

Original file line number	Diff line number	Diff line change
`@@ -875,7 +875,8 @@ private static int getAvailableGpu() {`
`875`	`875`	`}`
`876`	`876`	`}`
`877`	`877`	`}`
`878`		`- throw new AssertionError("Unexpected response.");`
	`878`	`+ // No MPS devices detected`
	`879`	`+ return 0;`
`879`	`880`	`} else {`
`880`	`881`	`Process process =`
`881`	`882`	`Runtime.getRuntime().exec("nvidia-smi --query-gpu=index --format=csv");`
Original file line number	Diff line number	Diff line change
`@@ -115,8 +115,13 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {`
`115`	`115`	`ConfigManager.init(args);`
`116`	`116`	`ConfigManager configManager = ConfigManager.getInstance();`
`117`	`117`	`String arch = System.getProperty("os.arch");`
	`118`	`+ String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");`
`118`	`119`	`if (arch.equals("aarch64")) {`
`119`		`- Assert.assertTrue(configManager.getNumberOfGpu() > 0);`
	`120`	`+ if (mac_arm64_cpu_only.equals("True")) {`
	`121`	`+ Assert.assertEquals(configManager.getNumberOfGpu(), 0);`
	`122`	`+ } else {`
	`123`	`+ Assert.assertTrue(configManager.getNumberOfGpu() > 0);`
	`124`	`+ }`
`120`	`125`	`}`
`121`	`126`	`}`
`122`	`127`	`}`