From 12a955d54208f66c5ebaab3ff903f875d06bcd1c Mon Sep 17 00:00:00 2001
From: Shixian Cui <shixian@amazon.com>
Date: Mon, 25 Nov 2024 20:36:21 +0000
Subject: [PATCH] Add AWS Inf2 instances support for aws_batch scheduler

---
 torchx/specs/named_resources_aws.py           | 44 +++++++++++++++++++
 torchx/specs/test/named_resources_aws_test.py | 29 ++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/torchx/specs/named_resources_aws.py b/torchx/specs/named_resources_aws.py
index cbd69988b..bdd2b9562 100644
--- a/torchx/specs/named_resources_aws.py
+++ b/torchx/specs/named_resources_aws.py
@@ -274,6 +274,46 @@ def aws_trn1_32xlarge() -> Resource:
     )
 
 
+def aws_inf2_xlarge() -> Resource:
+    return Resource(
+        cpu=4,
+        gpu=0,
+        memMB=16 * GiB,  # host memory (the 32 GB figure is accelerator memory)
+        capabilities={K8S_ITYPE: "inf2.xlarge"},
+        devices={NEURON_DEVICE: 1},
+    )
+
+
+def aws_inf2_8xlarge() -> Resource:
+    return Resource(
+        cpu=32,
+        gpu=0,
+        memMB=128 * GiB,  # host memory (the 32 GB figure is accelerator memory)
+        capabilities={K8S_ITYPE: "inf2.8xlarge"},
+        devices={NEURON_DEVICE: 1},
+    )
+
+
+def aws_inf2_24xlarge() -> Resource:
+    return Resource(
+        cpu=96,
+        gpu=0,
+        memMB=384 * GiB,  # host memory (192 GB is accelerator memory)
+        capabilities={K8S_ITYPE: "inf2.24xlarge"},
+        devices={NEURON_DEVICE: 6},
+    )
+
+
+def aws_inf2_48xlarge() -> Resource:
+    return Resource(
+        cpu=192,
+        gpu=0,
+        memMB=768 * GiB,  # host memory (384 GB is accelerator memory)
+        capabilities={K8S_ITYPE: "inf2.48xlarge"},
+        devices={NEURON_DEVICE: 12},
+    )
+
+
 NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
     "aws_t3.medium": aws_t3_medium,
     "aws_m5.2xlarge": aws_m5_2xlarge,
@@ -301,4 +341,8 @@ def aws_trn1_32xlarge() -> Resource:
     "aws_g5.48xlarge": aws_g5_48xlarge,
     "aws_trn1.2xlarge": aws_trn1_2xlarge,
     "aws_trn1.32xlarge": aws_trn1_32xlarge,
+    "aws_inf2.xlarge": aws_inf2_xlarge,
+    "aws_inf2.8xlarge": aws_inf2_8xlarge,
+    "aws_inf2.24xlarge": aws_inf2_24xlarge,
+    "aws_inf2.48xlarge": aws_inf2_48xlarge,
 }
diff --git a/torchx/specs/test/named_resources_aws_test.py b/torchx/specs/test/named_resources_aws_test.py
index 064483f8f..6a6ec5ef2 100644
--- a/torchx/specs/test/named_resources_aws_test.py
+++ b/torchx/specs/test/named_resources_aws_test.py
@@ -23,6 +23,10 @@
     aws_g5_4xlarge,
     aws_g5_8xlarge,
     aws_g5_xlarge,
+    aws_inf2_24xlarge,
+    aws_inf2_48xlarge,
+    aws_inf2_8xlarge,
+    aws_inf2_xlarge,
     aws_m5_2xlarge,
     aws_p3_16xlarge,
     aws_p3_2xlarge,
@@ -179,6 +183,31 @@ def test_aws_trn1(self) -> None:
         self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
         self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)
 
+    def test_aws_inf2(self) -> None:
+        inf2_1 = aws_inf2_xlarge()
+        self.assertEqual(4, inf2_1.cpu)
+        self.assertEqual(0, inf2_1.gpu)
+        self.assertEqual(16 * GiB, inf2_1.memMB)
+        self.assertEqual({NEURON_DEVICE: 1}, inf2_1.devices)
+
+        inf2_8 = aws_inf2_8xlarge()
+        self.assertEqual(32, inf2_8.cpu)
+        self.assertEqual(0, inf2_8.gpu)
+        self.assertEqual(128 * GiB, inf2_8.memMB)
+        self.assertEqual({NEURON_DEVICE: 1}, inf2_8.devices)
+
+        inf2_24 = aws_inf2_24xlarge()
+        self.assertEqual(96, inf2_24.cpu)
+        self.assertEqual(0, inf2_24.gpu)
+        self.assertEqual(384 * GiB, inf2_24.memMB)
+        self.assertEqual({NEURON_DEVICE: 6}, inf2_24.devices)
+
+        inf2_48 = aws_inf2_48xlarge()
+        self.assertEqual(192, inf2_48.cpu)
+        self.assertEqual(0, inf2_48.gpu)
+        self.assertEqual(768 * GiB, inf2_48.memMB)
+        self.assertEqual({NEURON_DEVICE: 12}, inf2_48.devices)
+
     def test_aws_m5_2xlarge(self) -> None:
         resource = aws_m5_2xlarge()
         self.assertEqual(8, resource.cpu)