From 12a955d54208f66c5ebaab3ff903f875d06bcd1c Mon Sep 17 00:00:00 2001
From: Shixian Cui
Date: Mon, 25 Nov 2024 20:36:21 +0000
Subject: [PATCH] Add AWS Inf2 instances support for aws_batch scheduler

Register named resources for the inf2.xlarge, inf2.8xlarge,
inf2.24xlarge and inf2.48xlarge instance types.

memMB is the instance (host) memory of each type: 16, 128, 384 and
768 GiB respectively. It is not the Inferentia2 accelerator memory
(32, 32, 192 and 384 GB), which is a separate figure in the AWS
instance tables.

---
 torchx/specs/named_resources_aws.py           | 44 +++++++++++++++++++
 torchx/specs/test/named_resources_aws_test.py | 29 ++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/torchx/specs/named_resources_aws.py b/torchx/specs/named_resources_aws.py
index cbd69988b..bdd2b9562 100644
--- a/torchx/specs/named_resources_aws.py
+++ b/torchx/specs/named_resources_aws.py
@@ -274,6 +274,46 @@ def aws_trn1_32xlarge() -> Resource:
     )
 
 
+def aws_inf2_xlarge() -> Resource:
+    return Resource(
+        cpu=4,
+        gpu=0,
+        memMB=16 * GiB,  # instance RAM, not the 32 GB accelerator memory
+        capabilities={K8S_ITYPE: "inf2.xlarge"},
+        devices={NEURON_DEVICE: 1},
+    )
+
+
+def aws_inf2_8xlarge() -> Resource:
+    return Resource(
+        cpu=32,
+        gpu=0,
+        memMB=128 * GiB,  # instance RAM, not the 32 GB accelerator memory
+        capabilities={K8S_ITYPE: "inf2.8xlarge"},
+        devices={NEURON_DEVICE: 1},
+    )
+
+
+def aws_inf2_24xlarge() -> Resource:
+    return Resource(
+        cpu=96,
+        gpu=0,
+        memMB=384 * GiB,  # instance RAM, not the 192 GB accelerator memory
+        capabilities={K8S_ITYPE: "inf2.24xlarge"},
+        devices={NEURON_DEVICE: 6},
+    )
+
+
+def aws_inf2_48xlarge() -> Resource:
+    return Resource(
+        cpu=192,
+        gpu=0,
+        memMB=768 * GiB,  # instance RAM, not the 384 GB accelerator memory
+        capabilities={K8S_ITYPE: "inf2.48xlarge"},
+        devices={NEURON_DEVICE: 12},
+    )
+
+
 NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
     "aws_t3.medium": aws_t3_medium,
     "aws_m5.2xlarge": aws_m5_2xlarge,
@@ -301,4 +341,8 @@ def aws_trn1_32xlarge() -> Resource:
     "aws_g5.48xlarge": aws_g5_48xlarge,
     "aws_trn1.2xlarge": aws_trn1_2xlarge,
     "aws_trn1.32xlarge": aws_trn1_32xlarge,
+    "aws_inf2.xlarge": aws_inf2_xlarge,
+    "aws_inf2.8xlarge": aws_inf2_8xlarge,
+    "aws_inf2.24xlarge": aws_inf2_24xlarge,
+    "aws_inf2.48xlarge": aws_inf2_48xlarge,
 }
diff --git a/torchx/specs/test/named_resources_aws_test.py b/torchx/specs/test/named_resources_aws_test.py
index 064483f8f..6a6ec5ef2 100644
--- a/torchx/specs/test/named_resources_aws_test.py
+++ b/torchx/specs/test/named_resources_aws_test.py
@@ -23,6 +23,10 @@
     aws_g5_4xlarge,
     aws_g5_8xlarge,
     aws_g5_xlarge,
+    aws_inf2_24xlarge,
+    aws_inf2_48xlarge,
+    aws_inf2_8xlarge,
+    aws_inf2_xlarge,
     aws_m5_2xlarge,
     aws_p3_16xlarge,
     aws_p3_2xlarge,
@@ -179,6 +183,31 @@ def test_aws_trn1(self) -> None:
         self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
         self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)
 
+    def test_aws_inf2(self) -> None:
+        inf2_1 = aws_inf2_xlarge()
+        self.assertEqual(4, inf2_1.cpu)
+        self.assertEqual(0, inf2_1.gpu)
+        self.assertEqual(16 * GiB, inf2_1.memMB)
+        self.assertEqual({NEURON_DEVICE: 1}, inf2_1.devices)
+
+        inf2_8 = aws_inf2_8xlarge()
+        self.assertEqual(32, inf2_8.cpu)
+        self.assertEqual(0, inf2_8.gpu)
+        self.assertEqual(128 * GiB, inf2_8.memMB)
+        self.assertEqual({NEURON_DEVICE: 1}, inf2_8.devices)
+
+        inf2_24 = aws_inf2_24xlarge()
+        self.assertEqual(96, inf2_24.cpu)
+        self.assertEqual(0, inf2_24.gpu)
+        self.assertEqual(384 * GiB, inf2_24.memMB)
+        self.assertEqual({NEURON_DEVICE: 6}, inf2_24.devices)
+
+        inf2_48 = aws_inf2_48xlarge()
+        self.assertEqual(192, inf2_48.cpu)
+        self.assertEqual(0, inf2_48.gpu)
+        self.assertEqual(768 * GiB, inf2_48.memMB)
+        self.assertEqual({NEURON_DEVICE: 12}, inf2_48.devices)
+
     def test_aws_m5_2xlarge(self) -> None:
         resource = aws_m5_2xlarge()
         self.assertEqual(8, resource.cpu)