Skip to content

Commit 087e813

Browse files
authored
Fix CI and Regression workflows for MAC Arm64 (#3128)
* Update CI and regression CPU workflows for Mac
* Skip MPS tests when running only on M1 CPU
* Fix test parametrization
* Fix env variable config to skip MPS tests
* Update workflow files
* Upgrade nick-fields/retry to v3
* Fix test import error
* Fix Mac M1 CPU only tests
* Fix env variable comparison
* Fix Mac M1 regression test
1 parent 0b4539f commit 087e813

File tree

6 files changed

+77
-19
lines changed

6 files changed

+77
-19
lines changed

.github/workflows/ci_cpu.yml

+8-5
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,19 @@ jobs:
2121
strategy:
2222
fail-fast: false
2323
matrix:
24-
os: [ubuntu-20.04, macOS-latest]
24+
os: [ubuntu-20.04, macos-latest]
2525
steps:
2626
- name: Setup Python for M1
27-
if: matrix.os == 'macos-14'
27+
if: matrix.os == 'macos-latest'
2828
uses: actions/setup-python@v5
2929
with:
3030
python-version: '3.10'
31+
architecture: arm64
3132
- name: Setup Python for all other OS
32-
if: matrix.os != 'macos-14'
33+
if: matrix.os != 'macos-latest'
3334
uses: actions/setup-python@v5
3435
with:
35-
python-version: 3.9
36+
python-version: '3.9'
3637
architecture: x64
3738
- name: Setup Java 17
3839
uses: actions/setup-java@v3
@@ -47,7 +48,9 @@ jobs:
4748
run: |
4849
python ts_scripts/install_dependencies.py --environment=dev
4950
- name: Torchserve Sanity
50-
uses: nick-fields/retry@v2
51+
env:
52+
TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
53+
uses: nick-fields/retry@v3
5154
with:
5255
timeout_minutes: 60
5356
max_attempts: 3

.github/workflows/ci_gpu.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
run: |
4646
python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
4747
- name: Torchserve Sanity
48-
uses: nick-fields/retry@v2
48+
uses: nick-fields/retry@v3
4949
with:
5050
timeout_minutes: 60
5151
retry_on: error

.github/workflows/regression_tests_cpu.yml

+8-5
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,24 @@ concurrency:
1515

1616
jobs:
1717
regression-cpu:
18-
# creates workflows for OS: ubuntu, macOS, macOS M1
18+
# creates workflows for OS: ubuntu, macOS M1
1919
runs-on: ${{ matrix.os }}
2020
strategy:
2121
fail-fast: false
2222
matrix:
23-
os: [ubuntu-20.04, macOS-latest]
23+
os: [ubuntu-20.04, macos-latest]
2424
steps:
2525
- name: Setup Python for M1
26-
if: matrix.os == 'macos-14'
26+
if: matrix.os == 'macos-latest'
2727
uses: actions/setup-python@v5
2828
with:
2929
python-version: '3.10'
30+
architecture: arm64
3031
- name: Setup Python for all other OS
31-
if: matrix.os != 'macos-14'
32+
if: matrix.os != 'macos-latest'
3233
uses: actions/setup-python@v5
3334
with:
34-
python-version: 3.9
35+
python-version: '3.9'
3536
architecture: x64
3637
- name: Setup Java 17
3738
uses: actions/setup-java@v3
@@ -46,5 +47,7 @@ jobs:
4647
run: |
4748
python ts_scripts/install_dependencies.py --environment=dev
4849
- name: Torchserve Regression Tests
50+
env:
51+
TS_MAC_ARM64_CPU_ONLY: ${{ matrix.os == 'macos-latest' && 'True' || 'False' }}
4952
run: |
5053
python test/regression_tests.py

frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -875,7 +875,8 @@ private static int getAvailableGpu() {
875875
}
876876
}
877877
}
878-
throw new AssertionError("Unexpected response.");
878+
// No MPS devices detected
879+
return 0;
879880
} else {
880881
Process process =
881882
Runtime.getRuntime().exec("nvidia-smi --query-gpu=index --format=csv");

frontend/server/src/test/java/org/pytorch/serve/util/ConfigManagerTest.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,13 @@ public void testNumGpuM1() throws ReflectiveOperationException, IOException {
115115
ConfigManager.init(args);
116116
ConfigManager configManager = ConfigManager.getInstance();
117117
String arch = System.getProperty("os.arch");
118+
String mac_arm64_cpu_only = System.getenv().getOrDefault("TS_MAC_ARM64_CPU_ONLY", "False");
118119
if (arch.equals("aarch64")) {
119-
Assert.assertTrue(configManager.getNumberOfGpu() > 0);
120+
if (mac_arm64_cpu_only.equals("True")) {
121+
Assert.assertEquals(configManager.getNumberOfGpu(), 0);
122+
} else {
123+
Assert.assertTrue(configManager.getNumberOfGpu() > 0);
124+
}
120125
}
121126
}
122127
}

test/pytest/test_device_config.py

+52-6
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,20 @@
1919
mnist_scriptes_py = os.path.join(REPO_ROOT, "examples/image_classifier/mnist/mnist.py")
2020

2121
HANDLER_PY = """
22+
import torch
23+
from ts.torch_handler.base_handler import BaseHandler
24+
25+
class deviceHandler(BaseHandler):
26+
27+
def initialize(self, context):
28+
super().initialize(context)
29+
if torch.backends.mps.is_available() and context.system_properties.get("gpu_id") is not None:
30+
assert self.get_device().type == "mps"
31+
else:
32+
assert self.get_device().type == "cpu"
33+
"""
34+
35+
HANDLER_PY_GPU = """
2236
from ts.torch_handler.base_handler import BaseHandler
2337
2438
class deviceHandler(BaseHandler):
@@ -28,6 +42,16 @@ def initialize(self, context):
2842
assert self.get_device().type == "mps"
2943
"""
3044

45+
HANDLER_PY_CPU = """
46+
from ts.torch_handler.base_handler import BaseHandler
47+
48+
class deviceHandler(BaseHandler):
49+
50+
def initialize(self, context):
51+
super().initialize(context)
52+
assert self.get_device().type == "cpu"
53+
"""
54+
3155
MODEL_CONFIG_YAML = """
3256
#frontend settings
3357
# TorchServe frontend parameters
@@ -78,8 +102,23 @@ def get_config(param):
78102
return get_config(request.param)
79103

80104

105+
@pytest.fixture(scope="module")
106+
def handler_py(request):
107+
def get_handler(param):
108+
if param == "cpu":
109+
return HANDLER_PY_CPU
110+
elif param == "gpu":
111+
return HANDLER_PY_GPU
112+
else:
113+
return HANDLER_PY
114+
115+
return get_handler(request.param)
116+
117+
81118
@pytest.fixture(scope="module", name="mar_file_path")
82-
def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
119+
def create_mar_file(
120+
work_dir, model_archiver, model_name, model_config_name, handler_py
121+
):
83122
mar_file_path = work_dir.joinpath(model_name + ".mar")
84123

85124
model_config_yaml_file = work_dir / "model_config.yaml"
@@ -90,7 +129,7 @@ def create_mar_file(work_dir, model_archiver, model_name, model_config_name):
90129
model_py_file.write_text(mnist_scriptes_py)
91130

92131
handler_py_file = work_dir / "handler.py"
93-
handler_py_file.write_text(HANDLER_PY)
132+
handler_py_file.write_text(handler_py)
94133

95134
config = ModelArchiverConfig(
96135
model_name=model_name,
@@ -147,22 +186,29 @@ def register_model(mar_file_path, model_store, torchserve):
147186
test_utils.unregister_model(model_name)
148187

149188

150-
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
189+
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
190+
@pytest.mark.skipif(
191+
os.environ.get("TS_MAC_ARM64_CPU_ONLY", "False") == "True",
192+
reason="Skip if running only on MAC CPU",
193+
)
151194
@pytest.mark.parametrize("model_config_name", ["gpu"], indirect=True)
195+
@pytest.mark.parametrize("handler_py", ["gpu"], indirect=True)
152196
def test_m1_device(model_name, model_config_name):
153197
response = requests.get(f"http://localhost:8081/models/{model_name}")
154198
assert response.status_code == 200, "Describe Failed"
155199

156200

157-
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
201+
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
158202
@pytest.mark.parametrize("model_config_name", ["cpu"], indirect=True)
203+
@pytest.mark.parametrize("handler_py", ["cpu"], indirect=True)
159204
def test_m1_device_cpu(model_name, model_config_name):
160205
response = requests.get(f"http://localhost:8081/models/{model_name}")
161-
assert response.status_code == 404, "Describe Worked"
206+
assert response.status_code == 200, "Describe Failed"
162207

163208

164-
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on Mac M1")
209+
@pytest.mark.skipif(platform.machine() != "arm64", reason="Skip on non Mac M1")
165210
@pytest.mark.parametrize("model_config_name", ["default"], indirect=True)
211+
@pytest.mark.parametrize("handler_py", ["default"], indirect=True)
166212
def test_m1_device_default(model_name, model_config_name):
167213
response = requests.get(f"http://localhost:8081/models/{model_name}")
168214
assert response.status_code == 200, "Describe Failed"

0 commit comments

Comments
 (0)