Skip to content

Commit a387c4c

Browse files
fix: gpu integs CapacityError - fallback to available compute (#3004)
Co-authored-by: Ahsan Khan <[email protected]>
1 parent cbae73f commit a387c4c

File tree

4 files changed

+82
-44
lines changed

4 files changed

+82
-44
lines changed

.readthedocs.yml → .readthedocs.yaml

+7-1
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,19 @@
44

55
version: 2
66

7+
build:
8+
os: ubuntu-20.04
9+
tools:
10+
python: "3.9"
11+
12+
713
python:
8-
version: 3.9
914
install:
1015
- method: pip
1116
path: .
1217
- requirements: doc/requirements.txt
1318

19+
1420
sphinx:
1521
configuration: doc/conf.py
1622
fail_on_warning: true # http://www.sphinx-doc.org/en/master/man/sphinx-build.html#id6

tests/conftest.py

+9
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,15 @@ def gpu_instance_type(sagemaker_session, request):
405405
return "ml.p3.2xlarge"
406406

407407

408+
@pytest.fixture(scope="session")
409+
def gpu_instance_type_list(sagemaker_session, request):
410+
region = sagemaker_session.boto_session.region_name
411+
if region in NO_P3_REGIONS:
412+
return ["ml.p2.xlarge"]
413+
else:
414+
return ["ml.p3.2xlarge", "ml.p2.xlarge"]
415+
416+
408417
@pytest.fixture(scope="session")
409418
def inf_instance_type(sagemaker_session, request):
410419
return "ml.inf1.xlarge"

tests/integ/test_huggingface.py

+33-22
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@
1515
import os
1616

1717
import pytest
18+
import logging
1819

1920
from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
2021
from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor
2122
from sagemaker.utils import unique_name_from_base
2223
from tests import integ
2324
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2425
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
26+
from sagemaker.exceptions import UnexpectedStatusException
2527

2628
ROLE = "SageMakerRole"
2729

@@ -34,32 +36,41 @@
3436
)
3537
def test_framework_processing_job_with_deps(
3638
sagemaker_session,
37-
gpu_instance_type,
39+
gpu_instance_type_list,
3840
huggingface_training_latest_version,
3941
huggingface_training_pytorch_latest_version,
4042
huggingface_pytorch_latest_training_py_version,
4143
):
42-
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
43-
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
44-
entry_point = "main_script.py"
45-
46-
processor = HuggingFaceProcessor(
47-
transformers_version=huggingface_training_latest_version,
48-
pytorch_version=huggingface_training_pytorch_latest_version,
49-
py_version=huggingface_pytorch_latest_training_py_version,
50-
role=ROLE,
51-
instance_count=1,
52-
instance_type=gpu_instance_type,
53-
sagemaker_session=sagemaker_session,
54-
base_job_name="test-huggingface",
55-
)
56-
57-
processor.run(
58-
code=entry_point,
59-
source_dir=code_path,
60-
inputs=[],
61-
wait=True,
62-
)
44+
for i_type in gpu_instance_type_list:
45+
logging.info("Using the instance type: {}".format(i_type))
46+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
47+
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
48+
entry_point = "main_script.py"
49+
50+
processor = HuggingFaceProcessor(
51+
transformers_version=huggingface_training_latest_version,
52+
pytorch_version=huggingface_training_pytorch_latest_version,
53+
py_version=huggingface_pytorch_latest_training_py_version,
54+
role=ROLE,
55+
instance_count=1,
56+
instance_type=i_type,
57+
sagemaker_session=sagemaker_session,
58+
base_job_name="test-huggingface",
59+
)
60+
try:
61+
processor.run(
62+
code=entry_point,
63+
source_dir=code_path,
64+
inputs=[],
65+
wait=True,
66+
)
67+
except UnexpectedStatusException as e:
68+
if "CapacityError" in str(e) and i_type != gpu_instance_type_list[-1]:
69+
logging.warning("Failure using instance type: {}. {}".format(i_type, str(e)))
70+
continue
71+
else:
72+
raise
73+
break
6374

6475

6576
@pytest.mark.release

tests/integ/test_tf.py

+33-21
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import numpy as np
1616
import os
1717
import time
18+
import logging
1819

1920
import pytest
2021

@@ -25,6 +26,8 @@
2526
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, kms_utils, timeout
2627
from tests.integ.retry import retries
2728
from tests.integ.s3_utils import assert_s3_file_patterns_exist
29+
from sagemaker.exceptions import UnexpectedStatusException
30+
2831

2932
ROLE = "SageMakerRole"
3033

@@ -42,30 +45,39 @@
4245
@pytest.mark.release
4346
def test_framework_processing_job_with_deps(
4447
sagemaker_session,
45-
instance_type,
48+
gpu_instance_type_list,
4649
tensorflow_training_latest_version,
4750
tensorflow_training_latest_py_version,
4851
):
49-
with timeout.timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
50-
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
51-
entry_point = "main_script.py"
52-
53-
processor = TensorFlowProcessor(
54-
framework_version=tensorflow_training_latest_version,
55-
py_version=tensorflow_training_latest_py_version,
56-
role=ROLE,
57-
instance_count=1,
58-
instance_type=instance_type,
59-
sagemaker_session=sagemaker_session,
60-
base_job_name="test-tensorflow",
61-
)
62-
63-
processor.run(
64-
code=entry_point,
65-
source_dir=code_path,
66-
inputs=[],
67-
wait=True,
68-
)
52+
for i_type in gpu_instance_type_list:
53+
logging.info("Using the instance type: {}".format(i_type))
54+
with timeout.timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
55+
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
56+
entry_point = "main_script.py"
57+
58+
processor = TensorFlowProcessor(
59+
framework_version=tensorflow_training_latest_version,
60+
py_version=tensorflow_training_latest_py_version,
61+
role=ROLE,
62+
instance_count=1,
63+
instance_type=i_type,
64+
sagemaker_session=sagemaker_session,
65+
base_job_name="test-tensorflow",
66+
)
67+
try:
68+
processor.run(
69+
code=entry_point,
70+
source_dir=code_path,
71+
inputs=[],
72+
wait=True,
73+
)
74+
except UnexpectedStatusException as e:
75+
if "CapacityError" in str(e) and i_type != gpu_instance_type_list[-1]:
76+
logging.warning("Failure using instance type: {}. {}".format(i_type, str(e)))
77+
continue
78+
else:
79+
raise
80+
break
6981

7082

7183
def test_mnist_with_checkpoint_config(

0 commit comments

Comments
 (0)