[Feat] amdsmi bindings integration (#132)

Co-authored-by: Jae-Won Chung <[email protected]>
ml-energy · Nov 15, 2024 · 0539e7e · 0539e7e
1 parent 6b2cdc1
commit 0539e7e
Show file tree

Hide file tree

Showing 5 changed files with 133 additions and 28 deletions.
diff --git a/docs/measure/index.md b/docs/measure/index.md
@@ -149,3 +149,23 @@ Total time (s): 4.421529293060303
 Total energy (J):
 {'GPU0': 198.52566362297537, 'GPU1': 206.22215216255188, 'GPU2': 201.08565518283845, 'GPU3': 201.79834523367884}
 ```
+
+## Hardware Support
+We currently support both NVIDIA (via NVML) and AMD GPUs (via AMDSMI, with ROCm 6.1 or later).
+
+### `get_gpus`
+The [`get_gpus`][zeus.device.get_gpus] function returns a [`GPUs`][zeus.device.gpu.GPUs] object, which can be either an [`NVIDIAGPUs`][zeus.device.gpu.NVIDIAGPUs] or [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object depending on the availability of `nvml` or `amdsmi`. Each [`GPUs`][zeus.device.gpu.GPUs] object contains one or more [`GPU`][zeus.device.gpu.common.GPU] instances, which are specifically [`NVIDIAGPU`][zeus.device.gpu.nvidia.NVIDIAGPU] or [`AMDGPU`][zeus.device.gpu.amd.AMDGPU] objects.
+
+These [`GPU`][zeus.device.gpu.common.GPU] objects directly call respective `nvml` or `amdsmi` methods, providing a one-to-one mapping of methods for seamless GPU abstraction and support for multiple GPU types. For example:
+- [`NVIDIAGPU.getName`][zeus.device.gpu.nvidia.NVIDIAGPU.getName] calls `pynvml.nvmlDeviceGetName`.
+- [`AMDGPU.getName`][zeus.device.gpu.amd.AMDGPU.getName] calls `amdsmi.amdsmi_get_gpu_asic_info`.
+
+### Notes on AMD GPUs
+
+#### AMD GPUs Initialization
+`amdsmi.amdsmi_get_energy_count` sometimes returns invalid values on certain GPUs or ROCm versions (e.g., MI100 on ROCm 6.2). See [ROCm issue #38](https://github.com/ROCm/amdsmi/issues/38) for more details. During the [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object initialization, we call `amdsmi.amdsmi_get_energy_count` twice for each GPU, with a 0.5-second delay between calls. The difference between the two readings is compared with the energy expected from the measured average power over that interval to determine whether `amdsmi.amdsmi_get_energy_count` is reliable; if it is not, `getTotalEnergyConsumption` is disabled for that GPU. The delay is shared by all GPUs, so initialization takes about 0.5 seconds regardless of the number of AMD GPUs.
+
+`amdsmi.amdsmi_get_power_info` provides "average_socket_power" and "current_socket_power" fields, but the "current_socket_power" field is sometimes not supported and returns "N/A". During the [`AMDGPUs`][zeus.device.gpu.AMDGPUs] object initialization, this field is checked, and if "N/A" is returned, the [`AMDGPU.getInstantPowerUsage`][zeus.device.gpu.amd.AMDGPU.getInstantPowerUsage] method is disabled. In that case, use [`AMDGPU.getAveragePowerUsage`][zeus.device.gpu.amd.AMDGPU.getAveragePowerUsage] instead.
+
+#### Supported AMD SMI Versions
+Only ROCm >= 6.1 is supported, as the AMDSMI APIs for power and energy return wrong values in earlier versions. For more information, see [ROCm issue #22](https://github.com/ROCm/amdsmi/issues/22). Ensure your `amdsmi` and ROCm versions are up to date.
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,8 @@ dependencies = [
     "pydantic",  # The `zeus.utils.pydantic_v1` compatibility layer allows us to unpin Pydantic in most cases.
     "rich",
     "tyro",
-    "httpx"
+    "httpx",
+    "amdsmi"
 ]
 dynamic = ["version"]
 

diff --git a/zeus/device/gpu/amd.py b/zeus/device/gpu/amd.py
@@ -4,12 +4,15 @@
 import functools
 import os
 import contextlib
+import time
 from typing import Sequence
 from functools import lru_cache
 
 try:
     import amdsmi  # type: ignore
-except ImportError:
+# Must catch all exceptions, since ImportError is not the only exception that can be raised (e.g., OSError if libamd_smi.so cannot be found, AttributeError on a version mismatch).
+# Specific exceptions are handled when import and initialization are retried in `amdsmi_is_available`.
+except Exception:
 
     class MockAMDSMI:
         """Mock class for AMD SMI library."""
@@ -41,6 +44,18 @@ def amdsmi_is_available() -> bool:
     except ImportError:
         logger.info("amdsmi is not available.")
         return False
+    # usually thrown if amdsmi can't find libamd_smi.so
+    except OSError:
+        if os.getenv("ROCM_PATH") is None:
+            logger.warning("`ROCM_PATH` is not set. Do you have ROCm installed?")
+        return False
+    # usually thrown if versions of amdsmi and ROCm are incompatible.
+    except AttributeError:
+        logger.warning(
+            "Failed to import amdsmi. "
+            "Ensure amdsmi's version is at least as high as the current ROCm version."
+        )
+        return False
     try:
         amdsmi.amdsmi_init()
         logger.info("amdsmi is available and initialized")
@@ -71,10 +86,10 @@ def __init__(self, gpu_index: int) -> None:
         """Initialize the GPU object."""
         super().__init__(gpu_index)
         self._get_handle()
-        # XXX(Jae-Won): Right now, the energy API's unit is broken (either the
-        # `power` field or the `counter_resolution` field). Before that, we're
-        # disabling the energy API.
-        self._supportsGetTotalEnergyConsumption = False
+
+        # These values are updated in AMDGPUs constructor
+        self._supportsGetTotalEnergyConsumption = True
+        self._supportsInstantPowerUsage = True
 
     _exception_map = {
         1: gpu_common.ZeusGPUInvalidArgError,  # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL
@@ -225,12 +240,28 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None:
             clk_type=amdsmi.AmdSmiClkType.GFX,
         )  # expects MHz
 
+    @_handle_amdsmi_errors
+    def getAveragePowerUsage(self) -> int:
+        """Return the average power draw of the GPU. Units: mW."""
+        # returns in W, convert to mW
+        return (
+            int(amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"])
+            * 1000
+        )
+
     @_handle_amdsmi_errors
     def getInstantPowerUsage(self) -> int:
         """Return the current power draw of the GPU. Units: mW."""
+        if not self._supportsInstantPowerUsage:
+            raise gpu_common.ZeusGPUNotSupportedError(
+                "Instant power usage is not supported on this AMD GPU. "
+                "This is because amdsmi.amdsmi_get_power_info does not return a valid 'current_socket_power'. "
+                "Please use `getAveragePowerUsage` instead."
+            )
         # returns in W, convert to mW
-        return int(
-            amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000
+        return (
+            int(amdsmi.amdsmi_get_power_info(self.handle)["current_socket_power"])
+            * 1000
         )
 
     @_handle_amdsmi_errors
@@ -242,28 +273,28 @@ def getAverageMemoryPowerUsage(self) -> int:
 
     @_handle_amdsmi_errors
     def supportsGetTotalEnergyConsumption(self) -> bool:
-        """Check if the GPU supports retrieving total energy consumption."""
-        if self._supportsGetTotalEnergyConsumption is None:
-            try:
-                _ = amdsmi.amdsmi_get_energy_count(self.handle)
-                self._supportsGetTotalEnergyConsumption = True
-            except amdsmi.AmdSmiLibraryException as e:
-                if (
-                    e.get_error_code() == 2
-                ):  # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED
-                    self._supportsGetTotalEnergyConsumption = False
-                else:
-                    raise e
-
+        """Check if the GPU supports retrieving total energy consumption. The result is determined during `AMDGPUs` initialization."""
         return self._supportsGetTotalEnergyConsumption
 
     @_handle_amdsmi_errors
     def getTotalEnergyConsumption(self) -> int:
         """Return the total energy consumption of the GPU since driver load. Units: mJ."""
-        info = amdsmi.amdsmi_get_energy_count(self.handle)
-        return int(
-            info["power"] / 1e3
-        )  # returns in micro Joules, convert to mili Joules
+        if not self._supportsGetTotalEnergyConsumption:
+            raise gpu_common.ZeusGPUNotSupportedError(
+                "Total energy consumption is not supported on this AMD GPU. "
+                "This is because the result of `amdsmi.amdsmi_get_energy_count` is not accurate. "
+                "Please use `getAveragePowerUsage` or `getInstantPowerUsage` to calculate energy usage."
+            )
+        energy_dict = amdsmi.amdsmi_get_energy_count(self.handle)
+        if "energy_accumulator" in energy_dict:  # Changed since amdsmi 6.2.1
+            energy = (
+                energy_dict["energy_accumulator"] * energy_dict["counter_resolution"]
+            )
+        else:
+            # Old API: assume has key "power". If not, exception will be handled by _handle_amdsmi_errors.
+            energy = energy_dict["power"] * energy_dict["counter_resolution"]
+
+        return int(energy / 1e3)  # returns in micro Joules, convert to mili Joules
 
 
 class AMDGPUs(gpu_common.GPUs):
@@ -292,11 +323,11 @@ def __init__(self, ensure_homogeneous: bool = False) -> None:
             self._init_gpus()
             if ensure_homogeneous:
                 self._ensure_homogeneous()
-        except amdsmi.AmdSmiException as e:
+        except amdsmi.AmdSmiLibraryException as e:
             exception_class = AMDGPU._exception_map.get(
-                e.value, gpu_common.ZeusBaseGPUError
+                e.get_error_code(), gpu_common.ZeusBaseGPUError
             )
-            raise exception_class(e.msg) from e
+            raise exception_class(e.get_error_info()) from e
 
     @property
     def gpus(self) -> Sequence[AMDGPU]:
@@ -318,8 +349,46 @@ def _init_gpus(self) -> None:
         else:
             visible_indices = list(range(len(amdsmi.amdsmi_get_processor_handles())))
 
+        # create the number of visible GPUs
         self._gpus = [AMDGPU(gpu_num) for gpu_num in visible_indices]
 
+        # set _supportsInstantPowerUsage for all GPUs
+        for gpu in self._gpus:
+            gpu._supportsInstantPowerUsage = isinstance(
+                amdsmi.amdsmi_get_power_info(gpu.handle)["current_socket_power"],
+                int,
+            )  # amdsmi.amdsmi_get_power_info["current_socket_power"] returns "N/A" if not supported
+
+        # set _supportsGetTotalEnergyConsumption for all GPUs
+        wait_time = 0.5  # seconds
+        powers = [gpu.getAveragePowerUsage() for gpu in self._gpus]
+        initial_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus]
+        time.sleep(wait_time)
+        final_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus]
+        measured_energies = [
+            final - initial for final, initial in zip(final_energies, initial_energies)
+        ]
+        expected_energies = [
+            power * wait_time for power in powers
+        ]  # energy = power * time
+
+        for gpu, measured_energy, expected_energy in zip(
+            self._gpus, measured_energies, expected_energies
+        ):
+            # Loose bound to rule out very obvious counter problems
+            if 0.1 < measured_energy / expected_energy < 10:
+                gpu._supportsGetTotalEnergyConsumption = True
+            else:
+                gpu._supportsGetTotalEnergyConsumption = False
+                logger.info(
+                    "Disabling `getTotalEnergyConsumption` for device %d. The result of `amdsmi.amdsmi_get_energy_count` is not accurate. Expected energy: %d mJ, Measured energy: %d mJ. "
+                    "This is a known issue with some AMD GPUs, please see https://github.com/ROCm/amdsmi/issues/38 for more information. "
+                    "You can still measure energy by polling either `getInstantPowerUsage` or `getAveragePowerUsage` and integrating over time.",
+                    gpu.gpu_index,
+                    expected_energy,
+                    measured_energy,
+                )
+
     def __del__(self) -> None:
         """Shut down AMDSMI."""
         with contextlib.suppress(amdsmi.AmdSmiException):

diff --git a/zeus/device/gpu/common.py b/zeus/device/gpu/common.py
@@ -96,6 +96,11 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None:
         """Reset the locked GPU clocks to the default."""
         pass
 
+    @abc.abstractmethod
+    def getAveragePowerUsage(self) -> int:
+        """Return the average power usage of the GPU. Units: mW."""
+        pass
+
     @abc.abstractmethod
     def getInstantPowerUsage(self) -> int:
         """Return the current power draw of the GPU. Units: mW."""

diff --git a/zeus/device/gpu/nvidia.py b/zeus/device/gpu/nvidia.py
@@ -189,6 +189,16 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None:
         """Reset the locked GPU clocks to the default."""
         pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)
 
+    @_handle_nvml_errors
+    def getAveragePowerUsage(self) -> int:
+        """Return the average power draw of the GPU. Units: mW."""
+        metric = pynvml.nvmlDeviceGetFieldValues(
+            self.handle, [pynvml.NVML_FI_DEV_POWER_AVERAGE]
+        )[0]
+        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
+            raise pynvml.NVMLError(ret)
+        return metric.value.uiVal
+
     @_handle_nvml_errors
     def getInstantPowerUsage(self) -> int:
         """Return the current power draw of the GPU. Units: mW."""